Spaces:

yuxindu
/

SegVol

Runtime error

App Files Files Community

yuxin commited on Dec 4, 2023

Commit

2af4882

1 Parent(s): 6c27712

init segvol

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

__pycache__/utils.cpython-39.pyc +0 -0
app.py +295 -0
model/LICENSE +21 -0
model/README.md +74 -0
model/__pycache__/inference_cpu.cpython-39.pyc +0 -0
model/asset/model.png +0 -0
model/asset/overview back.png +0 -0
model/asset/overview.png +0 -0
model/config/clip/config.json +157 -0
model/config/clip/special_tokens_map.json +1 -0
model/config/clip/tokenizer.json +0 -0
model/config/clip/tokenizer_config.json +1 -0
model/config/clip/vocab.json +0 -0
model/config/config_demo.json +8 -0
model/data_process/__pycache__/demo_data_process.cpython-39.pyc +0 -0
model/data_process/demo_data_process.py +91 -0
model/inference_cpu.py +171 -0
model/inference_demo.py +219 -0
model/network/__pycache__/model.cpython-39.pyc +0 -0
model/network/model.py +91 -0
model/script/inference_demo.sh +8 -0
model/segment_anything_volumetric/.ipynb_checkpoints/build_sam-checkpoint.py +172 -0
model/segment_anything_volumetric/__init__.py +12 -0
model/segment_anything_volumetric/__pycache__/__init__.cpython-310.pyc +0 -0
model/segment_anything_volumetric/__pycache__/__init__.cpython-39.pyc +0 -0
model/segment_anything_volumetric/__pycache__/automatic_mask_generator.cpython-310.pyc +0 -0
model/segment_anything_volumetric/__pycache__/automatic_mask_generator.cpython-39.pyc +0 -0
model/segment_anything_volumetric/__pycache__/build_sam.cpython-310.pyc +0 -0
model/segment_anything_volumetric/__pycache__/build_sam.cpython-39.pyc +0 -0
model/segment_anything_volumetric/__pycache__/predictor.cpython-310.pyc +0 -0
model/segment_anything_volumetric/__pycache__/predictor.cpython-39.pyc +0 -0
model/segment_anything_volumetric/automatic_mask_generator.py +372 -0
model/segment_anything_volumetric/build_sam.py +111 -0
model/segment_anything_volumetric/modeling/.ipynb_checkpoints/image_encoder_swin-checkpoint.py +709 -0
model/segment_anything_volumetric/modeling/.ipynb_checkpoints/prompt_encoder-checkpoint.py +232 -0
model/segment_anything_volumetric/modeling/__init__.py +11 -0
model/segment_anything_volumetric/modeling/__pycache__/__init__.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/__init__.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/common.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/common.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/image_encoder.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/image_encoder.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/image_encoder_swin.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/mask_decoder.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/mask_decoder.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/prompt_encoder.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/prompt_encoder.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/sam.cpython-310.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/sam.cpython-39.pyc +0 -0
model/segment_anything_volumetric/modeling/__pycache__/transformer.cpython-310.pyc +0 -0

__pycache__/utils.cpython-39.pyc ADDED Viewed

Binary file (3.85 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import streamlit as st
+from streamlit_drawable_canvas import st_canvas
+from streamlit_image_coordinates import streamlit_image_coordinates
+from model.data_process.demo_data_process import process_ct_gt
+import numpy as np
+import matplotlib.pyplot as plt
+from PIL import Image, ImageDraw
+import monai.transforms as transforms
+from utils import show_points, make_fig, reflect_points_into_model, initial_rectangle, reflect_json_data_to_3D_box, reflect_box_into_model, run
+print('script run')
#############################################
# init session_state
# Seed every session key the app relies on, but only when it is missing, so
# values written during earlier reruns are kept. The table is rebuilt on each
# script run, which means the list defaults are fresh objects every time.
_SESSION_DEFAULTS = {
    'option': None,
    'text_prompt': None,
    'reset_demo_case': False,
    'preds_3D': None,
    'data_item': None,
    'points': [],
    'use_text_prompt': False,
    'use_point_prompt': False,
    'use_box_prompt': False,
    'rectangle_3Dbox': [0, 0, 0, 0, 0, 0],
    'irregular_box': False,
    'running': False,
    'transparency': 0.25,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Demo volumes offered by the "Select a demo case" drop-down.
case_list = [
    'model/asset/Case_image_00001_0000.nii.gz',
    'cases/FLARE22_Tr_0002_0000.nii.gz',
    'cases/FLARE22_Tr_0005_0000.nii.gz',
    'cases/FLARE22_Tr_0034_0000.nii.gz',
    'cases/FLARE22_Tr_0045_0000.nii.gz'
]
+#############################################
+#############################################
+# reset functions
def clear_prompts():
    """Drop all spatial prompts: the clicked points and the 3D box."""
    st.session_state.points = []
    st.session_state.rectangle_3Dbox = [0, 0, 0, 0, 0, 0]


def reset_demo_case():
    """Forget the loaded case and flag that it must be processed again."""
    st.session_state.data_item = None
    st.session_state.reset_demo_case = True
    clear_prompts()


def clear_file():
    """Reset everything tied to the current case when the case source changes."""
    st.session_state.option = None
    # Empty the cache kept by the ``st.cache_data``-decorated loader.
    process_ct_gt.clear()
    # reset_demo_case() already clears the prompts, so one call is enough.
    reset_demo_case()
+#############################################
st.image(Image.open('model/asset/overview back.png'), use_column_width=True)

# modify demo case here
demo_type = st.radio(
    "Demo case source",
    ["Select", "Upload"],
    on_change=clear_file,
)
if demo_type == "Select":
    uploaded_file = st.selectbox(
        "Select a demo case",
        case_list,
        index=None,
        placeholder="Select a demo case...",
        on_change=reset_demo_case,
    )
else:
    uploaded_file = st.file_uploader(
        "Upload demo case(nii.gz)",
        type='nii.gz',
        on_change=reset_demo_case,
    )
st.session_state.option = uploaded_file

# (Re)process the case when one is chosen and either a reset was requested or
# nothing has been loaded yet.
case_chosen = st.session_state.option is not None
needs_processing = st.session_state.reset_demo_case or st.session_state.data_item is None
if case_chosen and needs_processing:
    st.session_state.data_item = process_ct_gt(st.session_state.option)
    st.session_state.reset_demo_case = False
    # Predictions made for a previous case no longer apply.
    st.session_state.preds_3D = None
prompt_col1, prompt_col2 = st.columns(2)

with prompt_col1:
    st.session_state.use_text_prompt = st.toggle('Sematic prompt')
    text_widgets_disabled = not st.session_state.use_text_prompt
    text_prompt_type = st.radio(
        "Sematic prompt type",
        ["Predefined", "Custom"],
        disabled=text_widgets_disabled,
    )
    if text_prompt_type == "Predefined":
        pre_text = st.selectbox(
            "Predefined anatomical category:",
            ['liver', 'right kidney', 'spleen', 'pancreas', 'aorta', 'inferior vena cava', 'right adrenal gland', 'left adrenal gland', 'gallbladder', 'esophagus', 'stomach', 'duodenum', 'left kidney'],
            index=None,
            disabled=text_widgets_disabled,
        )
    else:
        pre_text = st.text_input(
            'Enter an Anatomical word or phrase:',
            None,
            max_chars=20,
            disabled=text_widgets_disabled,
        )

with prompt_col2:
    spatial_prompt_on = st.toggle('Spatial prompt', on_change=clear_prompts)
    spatial_prompt = st.radio(
        "Spatial prompt type",
        ["Point prompt", "Box prompt"],
        on_change=clear_prompts,
        disabled=(not spatial_prompt_on),
    )

# A spatial prompt type is only active while the spatial toggle is on.
st.session_state.use_point_prompt = spatial_prompt_on and spatial_prompt == "Point prompt"
st.session_state.use_box_prompt = spatial_prompt_on and spatial_prompt == "Box prompt"

# The text prompt is kept only when the toggle is on and the text is non-empty.
if st.session_state.use_text_prompt and pre_text:
    st.session_state.text_prompt = pre_text
else:
    st.session_state.text_prompt = None
def _add_point_prompt(point, log_tag):
    """Store a clicked point prompt, allowing at most three points.

    A new point is appended to ``st.session_state.points`` and the script is
    rerun so the figures are redrawn with it. A point that is already stored
    is ignored, and a warning is shown once three points exist.
    """
    if len(st.session_state.points) >= 3:
        st.warning('Max point num is 3', icon="⚠️")
    elif point not in st.session_state.points:
        st.session_state.points.append(point)
        print(f'{log_tag} add rerun')
        st.rerun()


if st.session_state.option is None:
    st.write('please select demo case first')
else:
    # Edge length, in pixels, of the displayed views. process_ct_gt resizes
    # 'z_image' to 325 along every axis, so one pixel is one voxel.
    view_size = 325
    image_3D = st.session_state.data_item['z_image'][0].numpy()

    col_control1, col_control2 = st.columns(2)
    with col_control1:
        selected_index_z = st.slider('X-Y view', 0, image_3D.shape[0] - 1, 0, key='xy')
    with col_control2:
        selected_index_y = st.slider('X-Z view', 0, image_3D.shape[1] - 1, 0, key='xz')
        if st.session_state.use_box_prompt:
            top, bottom = st.select_slider(
                'Top and bottom of box',
                options=range(0, view_size),
                value=(0, view_size - 1)
            )
            st.session_state.rectangle_3Dbox[0] = top
            st.session_state.rectangle_3Dbox[3] = bottom

    col_image1, col_image2 = st.columns(2)
    if st.session_state.preds_3D is not None:
        st.session_state.transparency = st.slider('Mask opacity', 0.0, 1.0, 0.5)

    with col_image1:
        image_z_array = image_3D[selected_index_z]
        preds_z_array = None
        if st.session_state.preds_3D is not None:
            preds_z_array = st.session_state.preds_3D[selected_index_z]
        image_z = make_fig(image_z_array, preds_z_array, st.session_state.points, selected_index_z, 'xy')
        if st.session_state.use_point_prompt:
            value_xy = streamlit_image_coordinates(image_z, width=view_size)
            if value_xy is not None:
                _add_point_prompt((selected_index_z, value_xy['y'], value_xy['x']), 'point_ax_xy')
        elif st.session_state.use_box_prompt:
            canvas_result_xy = st_canvas(
                fill_color="rgba(255, 165, 0, 0.3)",  # Fixed fill color with some opacity
                stroke_width=3,
                stroke_color='#2909F1',
                background_image=image_z,
                update_streamlit=True,
                height=view_size,
                width=view_size,
                drawing_mode='transform',
                point_display_radius=0,
                key="canvas_xy",
                initial_drawing=initial_rectangle,
                display_toolbar=True
            )
            # Reading the box is best-effort: a failure leaves the previous box
            # untouched. Catch Exception rather than using a bare except so that
            # KeyboardInterrupt/SystemExit still propagate, and log the cause.
            try:
                angle = canvas_result_xy.json_data['objects'][0]['angle']
                print(angle)
                if angle != 0:
                    st.warning('Rotating is undefined behavior', icon="⚠️")
                    st.session_state.irregular_box = True
                else:
                    st.session_state.irregular_box = False
                reflect_json_data_to_3D_box(canvas_result_xy.json_data, view='xy')
            except Exception as err:
                print(f'exception: {err!r}')
        else:
            st.image(image_z, use_column_width=False)

    with col_image2:
        image_y_array = image_3D[:, selected_index_y, :]
        preds_y_array = None
        if st.session_state.preds_3D is not None:
            preds_y_array = st.session_state.preds_3D[:, selected_index_y, :]
        image_y = make_fig(image_y_array, preds_y_array, st.session_state.points, selected_index_y, 'xz')
        if st.session_state.use_point_prompt:
            value_xz = streamlit_image_coordinates(image_y, width=view_size)
            if value_xz is not None:
                _add_point_prompt((value_xz['y'], selected_index_y, value_xz['x']), 'point_ax_xz')
        elif st.session_state.use_box_prompt:
            box = st.session_state.rectangle_3Dbox
            # Outline the box only on slices that lie inside its extent along
            # the sliced axis (box entries 1 and 4).
            if box[1] <= selected_index_y <= box[4]:
                draw = ImageDraw.Draw(image_y)
                # rectangle xz view (upper-left and lower-right)
                rectangle_coords = [(box[2], box[0]), (box[5], box[3])]
                draw.rectangle(rectangle_coords, outline='#2909F1', width=3)
            st.image(image_y, use_column_width=False)
        else:
            st.image(image_y, use_column_width=False)
col1, col2, col3 = st.columns(3)

# "Clear" has nothing to do when no case is chosen, or when there are no
# points, no box prompt and no prediction.
nothing_to_clear = (
    not st.session_state.points
    and not st.session_state.use_box_prompt
    and st.session_state.preds_3D is None
)
# "Run" needs at least one prompt: text, points or box.
no_prompt_given = (
    st.session_state.text_prompt is None
    and not st.session_state.points
    and not st.session_state.use_box_prompt
)

with col1:
    clear_clicked = st.button(
        "Clear",
        use_container_width=True,
        disabled=(st.session_state.option is None or nothing_to_clear),
    )
    if clear_clicked:
        clear_prompts()
        st.session_state.preds_3D = None
        st.rerun()

with col3:
    run_clicked = st.button(
        "Run",
        type="primary",
        use_container_width=True,
        disabled=(
            st.session_state.data_item is None
            or no_prompt_given
            or st.session_state.irregular_box
            or st.session_state.running
        ),
    )
    if run_clicked:
        # Rerun first so the page is drawn with the flag set before inference.
        st.session_state.running = True
        st.rerun()

if st.session_state.running:
    # Lower the flag before running so it is not left set afterwards.
    st.session_state.running = False
    run()
    st.rerun()

model/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 BAAI-DCAI
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

model/README.md ADDED Viewed

	@@ -0,0 +1,74 @@

+# SegVol: Universal and Interactive Volumetric Medical Image Segmentation
+This repo is the official implementation of [SegVol: Universal and Interactive Volumetric Medical Image Segmentation](https://arxiv.org/abs/2311.13385).
+## News🚀
+(2023.11.24) *You can download weight files of SegVol and ViT(CTs pre-train) [here](https://drive.google.com/drive/folders/1TEJtgctH534Ko5r4i79usJvqmXVuLf54?usp=drive_link).* 🔥
+(2023.11.23) *The brief introduction and instruction have been uploaded.*
+(2023.11.23) *The inference demo code has been uploaded.*
+(2023.11.22) *The first edition of our paper has been uploaded to arXiv.* 📃
+## Introduction
+<img src="https://github.com/BAAI-DCAI/SegVol/blob/main/asset/overview.png" width="60%" height="60%">
+SegVol is a universal and interactive model for volumetric medical image segmentation. SegVol accepts **point**, **box**, and **text** prompts and outputs volumetric segmentations. By training on 90k unlabeled Computed Tomography (CT) volumes and 6k labeled CTs, this foundation model supports the segmentation of over 200 anatomical categories.
+We will release SegVol's **inference code**, **training code**, **model params** and **ViT pre-training params** (pre-training is performed over 2,000 epochs on 96k  CTs).
+## Usage
+### Requirements
+[PyTorch v1.11.0](https://pytorch.org/get-started/previous-versions/) (or a higher version) is required. After installing it, install the key requirements with the following commands:
+```
+pip install 'monai[all]==0.9.0'
+pip install einops==0.6.1
+pip install transformers==4.18.0
+pip install matplotlib
+```
+### Config and run demo script
+1. You can download the demo case [here](https://drive.google.com/drive/folders/1TEJtgctH534Ko5r4i79usJvqmXVuLf54?usp=drive_link), or download the whole demo dataset  [AbdomenCT-1K](https://github.com/JunMa11/AbdomenCT-1K) and choose any demo case you want.
+2. Please set CT path and Ground Truth path of the case in the [config_demo.json](https://github.com/BAAI-DCAI/SegVol/blob/main/config/config_demo.json).
+3. After that, configure the [inference_demo.sh](https://github.com/BAAI-DCAI/SegVol/blob/main/script/inference_demo.sh) file for execution:
+    - `$segvol_ckpt`: the path of SegVol's checkpoint (Download from [here](https://drive.google.com/drive/folders/1TEJtgctH534Ko5r4i79usJvqmXVuLf54?usp=drive_link)).
+    - `$work_dir`: the path of any folder where you want to save the log files and visualization results.
+4. Finally, you can control the **prompt type**, **zoom-in-zoom-out mechanism** and **visualization switch** [here](https://github.com/BAAI-DCAI/SegVol/blob/35f3ff9c943a74f630e6948051a1fe21aaba91bc/inference_demo.py#L208C11-L208C11).
+5. Now, just run `bash script/inference_demo.sh` to infer your demo case.
+## Citation
+If you find this repository helpful, please consider citing:
+```
+@misc{du2023segvol,
+      title={SegVol: Universal and Interactive Volumetric Medical Image Segmentation},
+      author={Yuxin Du and Fan Bai and Tiejun Huang and Bo Zhao},
+      year={2023},
+      eprint={2311.13385},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
+## Acknowledgement
+Thanks for the following amazing works:
+[HuggingFace](https://huggingface.co/).
+[CLIP](https://github.com/openai/CLIP).
+[MONAI](https://github.com/Project-MONAI/MONAI).
+[Image by brgfx](https://www.freepik.com/free-vector/anatomical-structure-human-bodies_26353260.htm) on Freepik.
+[Image by muammark](https://www.freepik.com/free-vector/people-icon-collection_1157380.htm#query=user&position=2&from_view=search&track=sph) on Freepik.
+[Image by pch.vector](https://www.freepik.com/free-vector/different-phone-hand-gestures-set_9649376.htm#query=Vector%20touch%20screen%20hand%20gestures&position=4&from_view=search&track=ais) on Freepik.
+[Image by starline](https://www.freepik.com/free-vector/set-three-light-bulb-represent-effective-business-idea-concept_37588597.htm#query=idea&position=0&from_view=search&track=sph) on Freepik.

model/__pycache__/inference_cpu.cpython-39.pyc ADDED Viewed

Binary file (4.67 kB). View file

model/asset/model.png ADDED Viewed

model/asset/overview back.png ADDED Viewed

model/asset/overview.png ADDED Viewed

model/config/clip/config.json ADDED Viewed

	@@ -0,0 +1,157 @@

+{
+  "_name_or_path": "openai/clip-vit-base-patch32",
+  "architectures": [
+    "CLIPModel"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 512,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 512,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 2048,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 8,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "projection_dim": 512,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": null,
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 32,
+    "prefix": null,
+    "projection_dim" : 512,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false
+  },
+  "vision_config_dict": null
+}

model/config/clip/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


+ {"bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": "<|endoftext|>"}

model/config/clip/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/config/clip/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|startoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": "<|endoftext|>", "add_prefix_space": false, "errors": "replace", "do_lower_case": true, "name_or_path": "./clip_ViT_B_32/"}

model/config/clip/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/config/config_demo.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "dataset_name": "AbdomenCT-1k",
+    "categories": ["liver", "kidney", "spleen", "pancreas"],
+    "demo_case": {
+        "ct_path": "path/to/Case_image",
+        "gt_path": "path/to/Case_label"
+    }
+}

model/data_process/__pycache__/demo_data_process.cpython-39.pyc ADDED Viewed

Binary file (3.26 kB). View file

model/data_process/demo_data_process.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import numpy as np
+import monai.transforms as transforms
+import streamlit as st
+import tempfile
class MinMaxNormalization(transforms.Transform):
    """Shift and scale the ``"image"`` entry of a sample by its min and max."""

    def __call__(self, data):
        """Return a shallow copy of ``data`` with ``"image"`` rescaled."""
        sample = dict(data)
        shifted = sample["image"] - sample["image"].min()
        # Clamp the divisor so a constant image does not divide by zero.
        divisor = np.clip(shifted.max(), a_min=1e-8, a_max=None)
        sample["image"] = shifted / divisor
        return sample
class DimTranspose(transforms.Transform):
    """Swap the last axis with the third-from-last axis of selected entries."""

    def __init__(self, keys):
        """Remember which dictionary keys to transpose."""
        self.keys = keys

    def __call__(self, data):
        """Return a shallow copy of ``data`` with each selected entry transposed."""
        sample = dict(data)
        for name in self.keys:
            array = sample[name]
            sample[name] = np.swapaxes(array, -1, -3)
        return sample
class ForegroundNormalization(transforms.Transform):
    """Clip and standardise selected entries using foreground statistics.

    Voxels brighter than the mean intensity are treated as foreground. Their
    0.05th and 99.95th percentiles give the clipping range, and their mean and
    standard deviation are used to standardise the clipped array.
    """

    def __init__(self, keys):
        """Remember which dictionary keys to normalise."""
        self.keys = keys

    def __call__(self, data):
        """Return a shallow copy of ``data`` with each selected entry normalised."""
        d = dict(data)
        for key in self.keys:
            d[key] = self.normalize(d[key])
        return d

    def normalize(self, ct_narray):
        """Return ``ct_narray`` clipped and standardised by its foreground voxels.

        An empty array is returned unchanged. When no voxel exceeds the mean
        (for example a constant volume), all voxels are used for the
        statistics, so the result is finite instead of NaN or an error.
        """
        # Read-only flat view; the statistics below never modify it.
        voxels = np.asarray(ct_narray).ravel()
        if voxels.size == 0:
            return ct_narray

        foreground = voxels[voxels > np.mean(voxels)]
        if foreground.size == 0:
            # Nothing lies above the mean: fall back to the whole volume.
            foreground = voxels

        upper_bound = np.percentile(foreground, 99.95)
        lower_bound = np.percentile(foreground, 0.05)
        mean = np.mean(foreground)
        std = np.std(foreground)

        ### transform ###
        ct_narray = np.clip(ct_narray, lower_bound, upper_bound)
        # Guard against a zero standard deviation.
        ct_narray = (ct_narray - mean) / max(std, 1e-8)
        return ct_narray
@st.cache_data
def process_ct_gt(case_path, spatial_size=(32,256,256)):
    """Load a CT volume and build the image variants used by the demo.

    Args:
        case_path: Path of the image file as a ``str``, or a readable binary
            file-like object holding a ``.nii.gz`` file. ``None`` short-circuits.
        spatial_size: Target size used for padding and for the
            ``'zoom_out_image'`` resize.

    Returns:
        ``None`` when ``case_path`` is ``None``. Otherwise a dict with
        ``'image'`` (the transformed volume), ``'zoom_out_image'`` (resized to
        ``spatial_size``) and ``'z_image'`` (resized to 325 along every axis).
    """
    if case_path is None:
        return None
    print('Data preprocessing...')
    # transform
    img_loader = transforms.LoadImage(dtype=np.float32)
    transform = transforms.Compose(
        [
            transforms.Orientationd(keys=["image"], axcodes="RAS"),
            ForegroundNormalization(keys=["image"]),
            DimTranspose(keys=["image"]),
            MinMaxNormalization(),
            transforms.SpatialPadd(keys=["image"], spatial_size=spatial_size, mode='constant'),
            transforms.CropForegroundd(keys=["image"], source_key="image"),
            transforms.ToTensord(keys=["image"]),
        ]
    )
    zoom_out_transform = transforms.Resized(keys=["image"], spatial_size=spatial_size, mode='nearest-exact')
    z_transform = transforms.Resized(keys=["image"], spatial_size=(325,325,325), mode='nearest-exact')

    # generate ct_voxel_ndarray
    if isinstance(case_path, str):
        ct_voxel_ndarray, _ = img_loader(case_path)
    else:
        # The stream may already have been consumed by an earlier call, so
        # rewind it when possible before reading.
        if hasattr(case_path, 'seek'):
            case_path.seek(0)
        bytes_data = case_path.read()
        with tempfile.NamedTemporaryFile(suffix='.nii.gz') as tmp:
            tmp.write(bytes_data)
            # The loader opens the file by name, so the bytes must be on disk.
            tmp.flush()
            ct_voxel_ndarray, _ = img_loader(tmp.name)

    # Drop singleton axes, then add one leading axis.
    ct_voxel_ndarray = np.array(ct_voxel_ndarray).squeeze()
    ct_voxel_ndarray = np.expand_dims(ct_voxel_ndarray, axis=0)

    # transform
    item = transform({'image': ct_voxel_ndarray})
    item['zoom_out_image'] = zoom_out_transform(item)['image']
    item['z_image'] = z_transform(item)['image']
    return item

model/inference_cpu.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import argparse
+import os
+import torch
+import torch.nn.functional as F
+import json
+import monai.transforms as transforms
+from model.segment_anything_volumetric import sam_model_registry
+from model.network.model import SegVol
+from model.data_process.demo_data_process import process_ct_gt
+from model.utils.monai_inferers_utils import sliding_window_inference, generate_box, select_points, build_binary_cube, build_binary_points, logits2roi_coor
+from model.utils.visualize import draw_result
+import streamlit as st
def _str2bool(value):
    """Parse a command-line boolean such as ``true``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def _int_tuple(value):
    """Parse ``"32,256,256"`` (optionally parenthesised) into a tuple of ints."""
    cleaned = str(value).replace('(', ' ').replace(')', ' ').replace(',', ' ')
    try:
        parsed = tuple(int(part) for part in cleaned.split())
    except ValueError:
        raise argparse.ArgumentTypeError(
            f'expected comma-separated integers, got {value!r}') from None
    if not parsed:
        raise argparse.ArgumentTypeError(
            f'expected comma-separated integers, got {value!r}')
    return parsed


def set_parse():
    """Build the inference argument parser and parse ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``.

    ``--test_mode`` and the size options use explicit converters.
    ``type=bool`` would turn any non-empty string, including ``"False"``, into
    ``True``, and ``type=tuple`` would split the argument into characters.
    """
    # %% set up parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_mode", default=True, type=_str2bool)
    parser.add_argument("--resume", type = str, default = 'model/asset/SegVol_v1.pth')
    parser.add_argument("-infer_overlap", default=0.0, type=float, help="sliding window inference overlap")
    parser.add_argument("-spatial_size", default=(32, 256, 256), type=_int_tuple)
    parser.add_argument("-patch_size", default=(4, 16, 16), type=_int_tuple)
    parser.add_argument('-work_dir', type=str, default='./work_dir')
    ### demo
    parser.add_argument("--clip_ckpt", type = str, default = 'model/config/clip')
    args = parser.parse_args()
    return args
+def zoom_in_zoom_out(args, segvol_model, image, image_resize, text_prompt, point_prompt, box_prompt):
+    image_single_resize = image_resize
+    image_single = image[0,0]
+    ori_shape = image_single.shape
+    # generate prompts
+    text_single = None if text_prompt is None else [text_prompt]
+    points_single = None
+    box_single = None
+    if args.use_point_prompt:
+        point, point_label = point_prompt
+        points_single = (point.unsqueeze(0).float(), point_label.unsqueeze(0).float())
+        binary_points_resize = build_binary_points(point, point_label, ori_shape)
+    if args.use_box_prompt:
+        box_single = box_prompt.unsqueeze(0).float()
+        binary_cube_resize = build_binary_cube(box_single, binary_cube_shape=ori_shape)
+    ####################
+    # zoom-out inference:
+    print('--- zoom out inference ---')
+    print(text_single)
+    print(f'use text-prompt [{text_single!=None}], use box-prompt [{box_single!=None}], use point-prompt [{points_single!=None}]')
+    with torch.no_grad():
+        logits_global_single = segvol_model(image_single_resize,
+                                            text=text_single,
+                                            boxes=box_single,
+                                            points=points_single)
+    # resize back global logits
+    logits_global_single = F.interpolate(
+            logits_global_single.cpu(),
+            size=ori_shape, mode='nearest')[0][0]
+    # build prompt reflection for zoom-in
+    if args.use_point_prompt:
+        binary_points = F.interpolate(
+            binary_points_resize.unsqueeze(0).unsqueeze(0).float(),
+            size=ori_shape, mode='nearest')[0][0]
+    if args.use_box_prompt:
+        binary_cube = F.interpolate(
+            binary_cube_resize.unsqueeze(0).unsqueeze(0).float(),
+            size=ori_shape, mode='nearest')[0][0]
+    # draw_result('unknow', image_single_resize, None, point_prompt, logits_global_single, logits_global_single)
+    if not args.use_zoom_in:
+        return logits_global_single
+    ####################
+    # zoom-in inference:
+    min_d, min_h, min_w, max_d, max_h, max_w = logits2roi_coor(args.spatial_size, logits_global_single)
+    if min_d is None:
+        print('Fail to detect foreground!')
+        return logits_global_single
+    # Crop roi
+    image_single_cropped = image_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1].unsqueeze(0).unsqueeze(0)
+    global_preds = (torch.sigmoid(logits_global_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1])>0.5).long()
+    assert not (args.use_box_prompt and args.use_point_prompt)
+    # label_single_cropped = label_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1].unsqueeze(0).unsqueeze(0)
+    prompt_reflection = None
+    if args.use_box_prompt:
+        binary_cube_cropped = binary_cube[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1]
+        prompt_reflection = (
+            binary_cube_cropped.unsqueeze(0).unsqueeze(0),
+            global_preds.unsqueeze(0).unsqueeze(0)
+        )
+    if args.use_point_prompt:
+        binary_points_cropped = binary_points[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1]
+        prompt_reflection = (
+            binary_points_cropped.unsqueeze(0).unsqueeze(0),
+            global_preds.unsqueeze(0).unsqueeze(0)
+        )
+    ## inference
+    with torch.no_grad():
+        logits_single_cropped = sliding_window_inference(
+                image_single_cropped, prompt_reflection,
+                args.spatial_size, 1, segvol_model, args.infer_overlap,
+                text=text_single,
+                use_box=args.use_box_prompt,
+                use_point=args.use_point_prompt,
+                logits_global_single=logits_global_single,
+            )
+        logits_single_cropped = logits_single_cropped.cpu().squeeze()
+        if logits_single_cropped.shape != logits_global_single.shape:
+            logits_global_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1] = logits_single_cropped
+    return logits_global_single
+@st.cache_resource
+def build_model():
+    # build model
+    clip_ckpt = 'model/config/clip'
+    resume = 'model/asset/SegVol_v1.pth'
+    sam_model = sam_model_registry['vit']()
+    segvol_model = SegVol(
+                        image_encoder=sam_model.image_encoder,
+                        mask_decoder=sam_model.mask_decoder,
+                        prompt_encoder=sam_model.prompt_encoder,
+                        clip_ckpt=clip_ckpt,
+                        roi_size=(32,256,256),
+                        patch_size=(4,16,16),
+                        test_mode=True,
+                        )
+    segvol_model = torch.nn.DataParallel(segvol_model)
+    segvol_model.eval()
+    # load param
+    if os.path.isfile(resume):
+        ## Map model to be loaded to specified single GPU
+        loc = 'cpu'
+        checkpoint = torch.load(resume, map_location=loc)
+        segvol_model.load_state_dict(checkpoint['model'], strict=True)
+        print("loaded checkpoint '{}' (epoch {})".format(resume, checkpoint['epoch']))
+    print('model build done!')
+    return segvol_model
+@st.cache_data
+def inference_case(_image, _image_zoom_out, _point_prompt, text_prompt, _box_prompt):
+    # seg config
+    args = set_parse()
+    args.use_zoom_in = True
+    args.use_text_prompt = text_prompt is not None
+    args.use_box_prompt = _box_prompt is not None
+    args.use_point_prompt = _point_prompt is not None
+    segvol_model = build_model()
+    # run inference
+    logits = zoom_in_zoom_out(
+        args, segvol_model,
+        _image.unsqueeze(0), _image_zoom_out.unsqueeze(0),
+        text_prompt, _point_prompt, _box_prompt)
+    print(logits.shape)
+    resize_transform = transforms.Compose([
+        transforms.AddChannel(),
+        transforms.Resize((325,325,325), mode='trilinear')
+    ]
+    )
+    logits = resize_transform(logits)[0]
+    print(logits.shape)
+    return (torch.sigmoid(logits) > 0.5).int().numpy()

model/inference_demo.py ADDED Viewed

	@@ -0,0 +1,219 @@

+import argparse
+import os
+import torch
+import torch.nn.functional as F
+import json
+from segment_anything_volumetric import sam_model_registry
+from network.model import SegVol
+from data_process.demo_data_process import process_ct_gt
+import monai.transforms as transforms
+from utils.monai_inferers_utils import sliding_window_inference, generate_box, select_points, build_binary_cube, build_binary_points, logits2roi_coor
+from utils.visualize import draw_result
+def set_parse():
+    # %% set up parser
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test_mode", default=True, type=bool)
+    parser.add_argument("--resume", type = str, default = '')
+    parser.add_argument("-infer_overlap", default=0.5, type=float, help="sliding window inference overlap")
+    parser.add_argument("-spatial_size", default=(32, 256, 256), type=tuple)
+    parser.add_argument("-patch_size", default=(4, 16, 16), type=tuple)
+    parser.add_argument('-work_dir', type=str, default='./work_dir')
+    ### demo
+    parser.add_argument('--demo_config', type=str, required=True)
+    parser.add_argument("--clip_ckpt", type = str, default = './config/clip')
+    args = parser.parse_args()
+    return args
+def dice_score(preds, labels):  # on GPU
+    assert preds.shape[0] == labels.shape[0], "predict & target batch size don't match\n" + str(preds.shape) + str(labels.shape)
+    predict = preds.view(1, -1)
+    target = labels.view(1, -1)
+    if target.shape[1] < 1e8:
+        predict = predict.cuda()
+        target = target.cuda()
+    predict = torch.sigmoid(predict)
+    predict = torch.where(predict > 0.5, 1., 0.)
+    tp = torch.sum(torch.mul(predict, target))
+    den = torch.sum(predict) + torch.sum(target) + 1
+    dice = 2 * tp / den
+    if target.shape[1] < 1e8:
+        predict = predict.cpu()
+        target = target.cpu()
+    return dice
+def zoom_in_zoom_out(args, segvol_model, image, image_resize, gt3D, gt3D_resize, categories=None):
+    logits_labels_record = {}
+    image_single_resize = image_resize
+    image_single = image[0,0]
+    ori_shape = image_single.shape
+    for item_idx in range(len(categories)):
+        # get label to generate prompts
+        label_single = gt3D[0][item_idx]
+        label_single_resize = gt3D_resize[0][item_idx]
+        # skip meaningless categories
+        if torch.sum(label_single) == 0:
+            print('No object, skip')
+            continue
+        # generate prompts
+        text_single = categories[item_idx] if args.use_text_prompt else None
+        if categories is not None: print(f'inference |{categories[item_idx]}| target...')
+        points_single = None
+        box_single = None
+        if args.use_point_prompt:
+            point, point_label = select_points(label_single_resize, num_positive_extra=3, num_negative_extra=3)
+            points_single = (point.unsqueeze(0).float().cuda(), point_label.unsqueeze(0).float().cuda())
+            binary_points_resize = build_binary_points(point, point_label, label_single_resize.shape)
+        if args.use_box_prompt:
+            box_single = generate_box(label_single_resize).unsqueeze(0).float().cuda()
+            binary_cube_resize = build_binary_cube(box_single, binary_cube_shape=label_single_resize.shape)
+        ####################
+        # zoom-out inference:
+        print('--- zoom out inference ---')
+        print(f'use text-prompt [{text_single!=None}], use box-prompt [{box_single!=None}], use point-prompt [{points_single!=None}]')
+        with torch.no_grad():
+            logits_global_single = segvol_model(image_single_resize.cuda(),
+                                                text=text_single,
+                                                boxes=box_single,
+                                                points=points_single)
+        # resize back global logits
+        logits_global_single = F.interpolate(
+                logits_global_single.cpu(),
+                size=ori_shape, mode='nearest')[0][0]
+        # build prompt reflection for zoom-in
+        if args.use_point_prompt:
+            binary_points = F.interpolate(
+                binary_points_resize.unsqueeze(0).unsqueeze(0).float(),
+                size=ori_shape, mode='nearest')[0][0]
+        if args.use_box_prompt:
+            binary_cube = F.interpolate(
+                binary_cube_resize.unsqueeze(0).unsqueeze(0).float(),
+                size=ori_shape, mode='nearest')[0][0]
+        zoom_out_dice = dice_score(logits_global_single.squeeze(), label_single.squeeze())
+        logits_labels_record[categories[item_idx]] = (
+            zoom_out_dice,
+            image_single,
+            points_single,
+            box_single,
+            logits_global_single,
+            label_single)
+        print(f'zoom out inference done with zoom_out_dice: {zoom_out_dice:.4f}')
+        if not args.use_zoom_in:
+            continue
+        ####################
+        # zoom-in inference:
+        min_d, min_h, min_w, max_d, max_h, max_w = logits2roi_coor(args.spatial_size, logits_global_single)
+        if min_d is None:
+            print('Fail to detect foreground!')
+            continue
+        # Crop roi
+        image_single_cropped = image_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1].unsqueeze(0).unsqueeze(0)
+        global_preds = (torch.sigmoid(logits_global_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1])>0.5).long()
+        assert not (args.use_box_prompt and args.use_point_prompt)
+        # label_single_cropped = label_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1].unsqueeze(0).unsqueeze(0)
+        prompt_reflection = None
+        if args.use_box_prompt:
+            binary_cube_cropped = binary_cube[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1]
+            prompt_reflection = (
+                binary_cube_cropped.unsqueeze(0).unsqueeze(0),
+                global_preds.unsqueeze(0).unsqueeze(0)
+            )
+        if args.use_point_prompt:
+            binary_points_cropped = binary_points[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1]
+            prompt_reflection = (
+                binary_points_cropped.unsqueeze(0).unsqueeze(0),
+                global_preds.unsqueeze(0).unsqueeze(0)
+            )
+        ## inference
+        with torch.no_grad():
+            logits_single_cropped = sliding_window_inference(
+                    image_single_cropped.cuda(), prompt_reflection,
+                    args.spatial_size, 1, segvol_model, args.infer_overlap,
+                    text=text_single,
+                    use_box=args.use_box_prompt,
+                    use_point=args.use_point_prompt,
+                )
+            logits_single_cropped = logits_single_cropped.cpu().squeeze()
+        logits_global_single[min_d:max_d+1, min_h:max_h+1, min_w:max_w+1] = logits_single_cropped
+        zoom_in_dice = dice_score(logits_global_single.squeeze(), label_single.squeeze())
+        logits_labels_record[categories[item_idx]] = (
+            zoom_in_dice,
+            image_single,
+            points_single,
+            box_single,
+            logits_global_single,
+            label_single)
+        print(f'===> zoom out dice {zoom_out_dice:.4f} -> zoom-out-zoom-in dice {zoom_in_dice:.4f} <===')
+    return logits_labels_record
+def inference_single_ct(args, segvol_model, data_item, categories):
+    segvol_model.eval()
+    image, gt3D = data_item["image"].float(), data_item["label"]
+    image_zoom_out, gt3D__zoom_out = data_item["zoom_out_image"].float(), data_item['zoom_out_label']
+    logits_labels_record = zoom_in_zoom_out(
+        args, segvol_model,
+        image.unsqueeze(0), image_zoom_out.unsqueeze(0),
+        gt3D.unsqueeze(0), gt3D__zoom_out.unsqueeze(0),     # add batch dim
+        categories=categories)
+    # visualize
+    if args.visualize:
+        for target, values in logits_labels_record.items():
+            dice_score, image, point_prompt, box_prompt, logits, labels = values
+            print(f'{target} result with Dice score {dice_score:.4f} visualizing')
+            draw_result(target + f"-Dice {dice_score:.4f}", image, box_prompt, point_prompt, logits, labels, args.spatial_size, args.work_dir)
+def main(args):
+    gpu = 0
+    torch.cuda.set_device(gpu)
+    # build model
+    sam_model = sam_model_registry['vit'](args=args)
+    segvol_model = SegVol(
+                        image_encoder=sam_model.image_encoder,
+                        mask_decoder=sam_model.mask_decoder,
+                        prompt_encoder=sam_model.prompt_encoder,
+                        clip_ckpt=args.clip_ckpt,
+                        roi_size=args.spatial_size,
+                        patch_size=args.patch_size,
+                        test_mode=args.test_mode,
+                        ).cuda()
+    segvol_model = torch.nn.DataParallel(segvol_model, device_ids=[gpu])
+    # load param
+    if os.path.isfile(args.resume):
+        ## Map model to be loaded to specified single GPU
+        loc = 'cuda:{}'.format(gpu)
+        checkpoint = torch.load(args.resume, map_location=loc)
+        segvol_model.load_state_dict(checkpoint['model'], strict=True)
+        print("loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
+    # load demo config
+    with open(args.demo_config, 'r') as file:
+        config_dict = json.load(file)
+    ct_path, gt_path, categories = config_dict['demo_case']['ct_path'], config_dict['demo_case']['gt_path'], config_dict['categories']
+    # preprocess for data
+    data_item = process_ct_gt(ct_path, gt_path, categories, args.spatial_size)   # keys: image, label
+    # seg config for prompt & zoom-in-zoom-out
+    args.use_zoom_in = True
+    args.use_text_prompt = True
+    args.use_box_prompt = True
+    args.use_point_prompt = False
+    args.visualize = False
+    inference_single_ct(args, segvol_model, data_item, categories)
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options, then run the demo.
+    args = set_parse()
+    main(args)

model/network/__pycache__/model.cpython-39.pyc ADDED Viewed

Binary file (3.28 kB). View file

model/network/model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from transformers import AutoTokenizer, CLIPTextModel, CLIPTextConfig
+#%% set up model
+class SegVol(nn.Module):
+    def __init__(self,
+                image_encoder,
+                mask_decoder,
+                prompt_encoder,
+                clip_ckpt,
+                roi_size,
+                patch_size,
+                test_mode=False,
+                ):
+        super().__init__()
+        self.image_encoder = image_encoder
+        self.mask_decoder = mask_decoder
+        self.prompt_encoder = prompt_encoder
+        self.text_encoder = TextEncoder(clip_ckpt)
+        self.feat_shape = np.array(roi_size)/np.array(patch_size)
+        self.test_mode = test_mode
+    def forward(self, image, text=None, boxes=None, points=None, **kwargs):
+        bs = image.shape[0]
+        img_shape = (image.shape[2], image.shape[3], image.shape[4])
+        image_embedding, _ = self.image_encoder(image)
+        image_embedding = image_embedding.transpose(1, 2).view(bs, -1,
+            int(self.feat_shape[0]), int(self.feat_shape[1]), int(self.feat_shape[2]))
+        # test mode
+        if self.test_mode:
+            return self.forward_decoder(image_embedding, img_shape, text, boxes, points)
+        # train mode
+        # future release
+    def forward_decoder(self, image_embedding, img_shape, text=None, boxes=None, points=None):
+        with torch.no_grad():
+            if boxes is not None:
+                if len(boxes.shape) == 2:
+                    boxes = boxes[:, None, :] # (B, 1, 6)
+            if text is not None:
+                text_embedding = self.text_encoder(text)  # (B, 768)
+            else:
+                text_embedding = None
+        sparse_embeddings, dense_embeddings = self.prompt_encoder(
+            points=points,
+            boxes=boxes,
+            masks=None,
+            text_embedding=text_embedding,
+        )
+        dense_pe = self.prompt_encoder.get_dense_pe()
+        low_res_masks, _ = self.mask_decoder(
+            image_embeddings=image_embedding,
+            text_embedding = text_embedding,
+            image_pe=dense_pe,
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=False,
+          )
+        logits = F.interpolate(low_res_masks, size=img_shape, mode='trilinear', align_corners=False)
+        return logits
+class TextEncoder(nn.Module):
+    def __init__(self, clip_ckpt):
+        super().__init__()
+        config = CLIPTextConfig()
+        self.clip_text_model = CLIPTextModel(config)
+        self.tokenizer = AutoTokenizer.from_pretrained(clip_ckpt)
+        self.dim_align = nn.Linear(512, 768)
+        # freeze text encoder
+        for param in self.clip_text_model.parameters():
+            param.requires_grad = False
+    def organ2tokens(self, organ_names):
+        text_list = ['A computerized tomography of a {}.'.format(organ_name) for organ_name in organ_names]
+        tokens = self.tokenizer(text_list, padding=True, return_tensors="pt")
+        return tokens
+    def forward(self, text):
+        if text is None:
+            return None
+        if type(text) is str:
+            text = [text]
+        tokens = self.organ2tokens(text)
+        clip_outputs = self.clip_text_model(**tokens)
+        text_embedding = clip_outputs.pooler_output
+        text_embedding = self.dim_align(text_embedding)
+        return text_embedding

model/script/inference_demo.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+# Launch the SegVol demo inference on GPU 0.
+# Replace the two placeholder paths below before running.
+export segvol_ckpt="path/to/SegVol_v1.pth"
+export work_dir="path/to/work_dir"
+export demo_config_path="./config/config_demo.json"
+# Expansions are quoted so paths containing spaces reach the script as one argument.
+CUDA_VISIBLE_DEVICES=0 python inference_demo.py \
+--resume "$segvol_ckpt" \
+-work_dir "$work_dir" \
+--demo_config "$demo_config_path"

model/segment_anything_volumetric/.ipynb_checkpoints/build_sam-checkpoint.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from functools import partial
+from pathlib import Path
+import urllib.request
+import torch
+from .modeling import (
+    ImageEncoderViT,
+    MaskDecoder,
+    PromptEncoder,
+    Sam,
+    TwoWayTransformer,
+)
+from .modeling.image_encoder_swin import SwinTransformer
+from monai.utils import ensure_tuple_rep, optional_import
+def build_sam_vit_h(checkpoint=None, image_size=1024):
+    return _build_sam(
+        encoder_embed_dim=1280,
+        encoder_depth=32,
+        encoder_num_heads=16,
+        encoder_global_attn_indexes=[7, 15, 23, 31],
+        checkpoint=checkpoint,
+        image_size=image_size,
+    )
+build_sam = build_sam_vit_h
+def build_sam_vit_l(checkpoint=None, image_size=1024):
+    return _build_sam(
+        encoder_embed_dim=1024,
+        encoder_depth=24,
+        encoder_num_heads=16,
+        encoder_global_attn_indexes=[5, 11, 17, 23],
+        checkpoint=checkpoint,
+        image_size=image_size,
+    )
+def build_sam_vit_b(checkpoint=None, image_size=1024):
+    return _build_sam(
+        encoder_embed_dim=768,
+        encoder_depth=12,
+        encoder_num_heads=12,
+        encoder_global_attn_indexes=[2, 5, 8, 11],
+        checkpoint=checkpoint,
+        image_size=image_size,
+    )
+"""
+Examples::
+            # for 3D single channel input with size (96,96,96), 4-channel output and feature size of 48.
+            >>> net = SwinUNETR(img_size=(96,96,96), in_channels=1, out_channels=4, feature_size=48)
+            # for 3D 4-channel input with size (128,128,128), 3-channel output and (2,4,2,2) layers in each stage.
+            >>> net = SwinUNETR(img_size=(128,128,128), in_channels=4, out_channels=3, depths=(2,4,2,2))
+            # for 2D 3-channel input with size (96,96), 2-channel output and gradient checkpointing.
+            >>> net = SwinUNETR(img_size=(96,96), in_channels=3, out_channels=2, use_checkpoint=True, spatial_dims=2)
+"""
+def build_sam_vit_swin(checkpoint=None, image_size=96):
+    print('==> build_sam_vit_swin')
+    return _build_sam(
+        encoder_embed_dim=48,
+        encoder_depth=12,
+        encoder_num_heads=12,
+        encoder_global_attn_indexes=[2, 5, 8, 11],
+        checkpoint=checkpoint,
+        image_size=image_size,
+    )
+# Maps a model-type key to its builder function; "default" resolves to the vit_h builder.
+sam_model_registry = {
+    "default": build_sam_vit_h,
+    "vit_h": build_sam_vit_h,
+    "vit_l": build_sam_vit_l,
+    "vit_b": build_sam_vit_b,
+    "swin_vit": build_sam_vit_swin,
+}
+def _build_sam(
+    encoder_embed_dim,
+    encoder_depth,
+    encoder_num_heads,
+    encoder_global_attn_indexes,
+    checkpoint=None,
+    image_size=None,
+    spatial_dims=3,
+):
+    prompt_embed_dim = 768
+    patch_size = ensure_tuple_rep(2, spatial_dims)
+    window_size = ensure_tuple_rep(7, spatial_dims)
+    image_embedding_size = [size // 32 for size in image_size]
+    sam = Sam(
+        image_encoder=SwinTransformer(
+            in_chans=1,
+            embed_dim=encoder_embed_dim,
+            window_size=window_size,
+            patch_size=patch_size,
+            depths=(2, 2, 6, 2), #(2, 2, 6, 2),
+            num_heads=(3, 6, 12, 24),
+            mlp_ratio=4.0,
+            qkv_bias=True,
+            spatial_dims=spatial_dims,
+        ),
+        prompt_encoder=PromptEncoder(
+            embed_dim=prompt_embed_dim,
+            image_embedding_size=image_embedding_size,
+            input_image_size=image_size,
+            mask_in_chans=16,
+        ),
+        mask_decoder=MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+        ),
+        pixel_mean=[123.675, 116.28, 103.53],
+        pixel_std=[58.395, 57.12, 57.375],
+    )
+    sam.eval()
+    if checkpoint is not None:
+        checkpoint = Path(checkpoint)
+        if checkpoint.name == "sam_vit_b_01ec64.pth" and not checkpoint.exists():
+            cmd = input("Download sam_vit_b_01ec64.pth from facebook AI? [y]/n: ")
+            if len(cmd) == 0 or cmd.lower() == 'y':
+                checkpoint.parent.mkdir(parents=True, exist_ok=True)
+                print("Downloading SAM ViT-B checkpoint...")
+                urllib.request.urlretrieve(
+                    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
+                    checkpoint,
+                )
+                print(checkpoint.name, " is downloaded!")
+        elif checkpoint.name == "sam_vit_h_4b8939.pth" and not checkpoint.exists():
+            cmd = input("Download sam_vit_h_4b8939.pth from facebook AI? [y]/n: ")
+            if len(cmd) == 0 or cmd.lower() == 'y':
+                checkpoint.parent.mkdir(parents=True, exist_ok=True)
+                print("Downloading SAM ViT-H checkpoint...")
+                urllib.request.urlretrieve(
+                    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+                    checkpoint,
+                )
+                print(checkpoint.name, " is downloaded!")
+        elif checkpoint.name == "sam_vit_l_0b3195.pth" and not checkpoint.exists():
+            cmd = input("Download sam_vit_l_0b3195.pth from facebook AI? [y]/n: ")
+            if len(cmd) == 0 or cmd.lower() == 'y':
+                checkpoint.parent.mkdir(parents=True, exist_ok=True)
+                print("Downloading SAM ViT-L checkpoint...")
+                urllib.request.urlretrieve(
+                    "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
+                    checkpoint,
+                )
+                print(checkpoint.name, " is downloaded!")
+    if checkpoint is not None:
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f)
+        sam.load_state_dict(state_dict)
+    return sam

model/segment_anything_volumetric/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .build_sam import (
+    build_sam_vit_3d,
+    sam_model_registry,
+)
+from .predictor import SamPredictor
+from .automatic_mask_generator import SamAutomaticMaskGenerator

model/segment_anything_volumetric/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (407 Bytes). View file

model/segment_anything_volumetric/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (377 Bytes). View file

model/segment_anything_volumetric/__pycache__/automatic_mask_generator.cpython-310.pyc ADDED Viewed

Binary file (11.4 kB). View file

model/segment_anything_volumetric/__pycache__/automatic_mask_generator.cpython-39.pyc ADDED Viewed

Binary file (11.4 kB). View file

model/segment_anything_volumetric/__pycache__/build_sam.cpython-310.pyc ADDED Viewed

Binary file (3.3 kB). View file

model/segment_anything_volumetric/__pycache__/build_sam.cpython-39.pyc ADDED Viewed

Binary file (2.62 kB). View file

model/segment_anything_volumetric/__pycache__/predictor.cpython-310.pyc ADDED Viewed

Binary file (9.96 kB). View file

model/segment_anything_volumetric/__pycache__/predictor.cpython-39.pyc ADDED Viewed

Binary file (9.98 kB). View file

model/segment_anything_volumetric/automatic_mask_generator.py ADDED Viewed

	@@ -0,0 +1,372 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import torch
+from torchvision.ops.boxes import batched_nms, box_area  # type: ignore
+from typing import Any, Dict, List, Optional, Tuple
+from .modeling import Sam
+from .predictor import SamPredictor
+from .utils.amg import (
+    MaskData,
+    area_from_rle,
+    batch_iterator,
+    batched_mask_to_box,
+    box_xyxy_to_xywh,
+    build_all_layer_point_grids,
+    calculate_stability_score,
+    coco_encode_rle,
+    generate_crop_boxes,
+    is_box_near_crop_edge,
+    mask_to_rle_pytorch,
+    remove_small_regions,
+    rle_to_mask,
+    uncrop_boxes_xyxy,
+    uncrop_masks,
+    uncrop_points,
+)
+class SamAutomaticMaskGenerator:
+    def __init__(
+        self,
+        model: Sam,
+        points_per_side: Optional[int] = 32,
+        points_per_batch: int = 64,
+        pred_iou_thresh: float = 0.88,
+        stability_score_thresh: float = 0.95,
+        stability_score_offset: float = 1.0,
+        box_nms_thresh: float = 0.7,
+        crop_n_layers: int = 0,
+        crop_nms_thresh: float = 0.7,
+        crop_overlap_ratio: float = 512 / 1500,
+        crop_n_points_downscale_factor: int = 1,
+        point_grids: Optional[List[np.ndarray]] = None,
+        min_mask_region_area: int = 0,
+        output_mode: str = "binary_mask",
+    ) -> None:
+        """
+        Using a SAM model, generates masks for the entire image.
+        Generates a grid of point prompts over the image, then filters
+        low quality and duplicate masks. The default settings are chosen
+        for SAM with a ViT-H backbone.
+        Arguments:
+          model (Sam): The SAM model to use for mask prediction.
+          points_per_side (int or None): The number of points to be sampled
+            along one side of the image. The total number of points is
+            points_per_side**2. If None, 'point_grids' must provide explicit
+            point sampling.
+          points_per_batch (int): Sets the number of points run simultaneously
+            by the model. Higher numbers may be faster but use more GPU memory.
+          pred_iou_thresh (float): A filtering threshold in [0,1], using the
+            model's predicted mask quality.
+          stability_score_thresh (float): A filtering threshold in [0,1], using
+            the stability of the mask under changes to the cutoff used to binarize
+            the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculated the stability score.
+          box_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks.
+          crop_n_layers (int): If >0, mask prediction will be run again on
+            crops of the image. Sets the number of layers to run, where each
+            layer has 2**i_layer number of image crops.
+          crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks between different crops.
+          crop_overlap_ratio (float): Sets the degree to which crops overlap.
+            In the first crop layer, crops will overlap by this fraction of
+            the image length. Later layers with more crops scale down this overlap.
+          crop_n_points_downscale_factor (int): The number of points-per-side
+            sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): A list over explicit grids
+            of points used for sampling, normalized to [0,1]. The nth grid in the
+            list is used in the nth crop layer. Exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing will be applied
+            to remove disconnected regions and holes in masks with area smaller
+            than min_mask_region_area. Requires opencv.
+          output_mode (str): The form masks are returned in. Can be 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+            For large resolutions, 'binary_mask' may consume large amounts of
+            memory.
+        """
+        assert (points_per_side is None) != (
+            point_grids is None
+        ), "Exactly one of points_per_side or point_grid must be provided."
+        if points_per_side is not None:
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side,
+                crop_n_layers,
+                crop_n_points_downscale_factor,
+            )
+        elif point_grids is not None:
+            self.point_grids = point_grids
+        else:
+            raise ValueError("Can't have both points_per_side and point_grid be None.")
+        assert output_mode in [
+            "binary_mask",
+            "uncompressed_rle",
+            "coco_rle",
+        ], f"Unknown output_mode {output_mode}."
+        if output_mode == "coco_rle":
+            from pycocotools import mask as mask_utils  # type: ignore # noqa: F401
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+        self.predictor = SamPredictor(model)
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """
+        Runs the whole mask-generation pipeline on one image and returns one
+        annotation record per surviving mask.
+        Arguments:
+          image (np.ndarray): The image to generate masks for, in HWC uint8 format.
+        Returns:
+           list(dict(str, any)): One record per mask, each holding:
+               segmentation (dict(str, any) or np.ndarray): The mask itself. An
+                 HW array when output_mode='binary_mask', otherwise a
+                 dictionary containing the RLE.
+               bbox (list(float)): Box around the mask, XYWH format.
+               area (int): Mask area in pixels.
+               predicted_iou (float): The model's own quality estimate for the
+                 mask (already filtered by pred_iou_thresh).
+               point_coords (list(list(float))): The prompt point that
+                 produced this mask.
+               stability_score (float): Mask quality measure (already
+                 filtered by stability_score_thresh).
+               crop_box (list(float)): The image crop the mask was generated
+                 from, XYWH format.
+        """
+        records = self._generate_masks(image)
+        # Optional cleanup of small islands/holes; NMS is re-run inside using
+        # the larger of the two configured NMS thresholds.
+        min_area = self.min_mask_region_area
+        if min_area > 0:
+            nms_thresh = max(self.box_nms_thresh, self.crop_nms_thresh)
+            records = self.postprocess_small_regions(records, min_area, nms_thresh)
+        # Choose how the segmentations are encoded in the output.
+        mode = self.output_mode
+        if mode == "coco_rle":
+            encoded = [coco_encode_rle(rle) for rle in records["rles"]]
+        elif mode == "binary_mask":
+            encoded = [rle_to_mask(rle) for rle in records["rles"]]
+        else:
+            encoded = records["rles"]
+        records["segmentations"] = encoded
+        # One plain-Python record per mask.
+        return [
+            {
+                "segmentation": segmentation,
+                "area": area_from_rle(records["rles"][i]),
+                "bbox": box_xyxy_to_xywh(records["boxes"][i]).tolist(),
+                "predicted_iou": records["iou_preds"][i].item(),
+                "point_coords": [records["points"][i].tolist()],
+                "stability_score": records["stability_score"][i].item(),
+                "crop_box": box_xyxy_to_xywh(records["crop_boxes"][i]).tolist(),
+            }
+            for i, segmentation in enumerate(records["segmentations"])
+        ]
+    def _generate_masks(self, image: np.ndarray) -> MaskData:
+        """
+        Generates masks for every crop of the image, merges them, and removes
+        masks duplicated across crops. The returned MaskData has been
+        converted with to_numpy().
+        """
+        full_hw = image.shape[:2]
+        crop_boxes, layer_idxs = generate_crop_boxes(
+            full_hw, self.crop_n_layers, self.crop_overlap_ratio
+        )
+        # Collect the masks of every crop into one container
+        merged = MaskData()
+        for box, layer in zip(crop_boxes, layer_idxs):
+            merged.cat(self._process_crop(image, box, layer, full_hw))
+        # Cross-crop de-duplication is only needed when there are several crops
+        if len(crop_boxes) > 1:
+            # Score is the inverse crop area, so masks from smaller crops win
+            crop_scores = (1 / box_area(merged["crop_boxes"])).to(merged["boxes"].device)
+            categories = torch.zeros_like(merged["boxes"][:, 0])
+            survivors = batched_nms(
+                merged["boxes"].float(),
+                crop_scores,
+                categories,
+                iou_threshold=self.crop_nms_thresh,
+            )
+            merged.filter(survivors)
+        merged.to_numpy()
+        return merged
+    def _process_crop(
+        self,
+        image: np.ndarray,
+        crop_box: List[int],
+        crop_layer_idx: int,
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Generates masks for a single crop: embeds the cropped image, prompts
+        the predictor with this layer's point grid in batches, removes
+        duplicates inside the crop, and maps boxes/points back to the frame
+        of the original image.
+        """
+        left, top, right, bottom = crop_box
+        crop_img = image[top:bottom, left:right, :]
+        crop_hw = crop_img.shape[:2]
+        self.predictor.set_image(crop_img)
+        # The grid is normalized; multiply by (w, h) to get pixel coordinates
+        wh_scale = np.array(crop_hw)[None, ::-1]
+        crop_points = self.point_grids[crop_layer_idx] * wh_scale
+        # Prompt the model batch by batch and accumulate the results
+        crop_masks = MaskData()
+        for (point_batch,) in batch_iterator(self.points_per_batch, crop_points):
+            crop_masks.cat(self._process_batch(point_batch, crop_hw, crop_box, orig_size))
+        self.predictor.reset_image()
+        # NMS inside the crop, ranked by predicted IoU, single category
+        categories = torch.zeros_like(crop_masks["boxes"][:, 0])
+        kept = batched_nms(
+            crop_masks["boxes"].float(),
+            crop_masks["iou_preds"],
+            categories,
+            iou_threshold=self.box_nms_thresh,
+        )
+        crop_masks.filter(kept)
+        # Translate from crop coordinates to original-image coordinates
+        crop_masks["boxes"] = uncrop_boxes_xyxy(crop_masks["boxes"], crop_box)
+        crop_masks["points"] = uncrop_points(crop_masks["points"], crop_box)
+        # Remember, for every surviving mask, which crop produced it
+        n_kept = len(crop_masks["rles"])
+        crop_masks["crop_boxes"] = torch.tensor([crop_box] * n_kept)
+        return crop_masks
+    def _process_batch(
+        self,
+        points: np.ndarray,
+        im_size: Tuple[int, ...],
+        crop_box: List[int],
+        orig_size: Tuple[int, ...],
+    ) -> MaskData:
+        """
+        Prompts the predictor with one batch of points and returns the
+        filtered results as a MaskData holding "iou_preds", "points",
+        "stability_score", "boxes" and "rles" (the dense "masks" entry is
+        deleted before returning).
+        Arguments:
+          points (np.ndarray): Prompt points for this batch, in the pixel
+            frame of the current crop.
+          im_size (tuple(int)): Size of the cropped image, passed to the
+            predictor's coordinate transform.
+          crop_box (list(int)): The crop, used to drop masks touching the
+            crop edge and to un-crop the masks.
+          orig_size (tuple(int)): (height, width) of the original image.
+        """
+        orig_h, orig_w = orig_size
+        # Run model on this batch
+        transformed_points = self.predictor.transform.apply_coords(points, im_size)
+        in_points = torch.as_tensor(transformed_points, device=self.predictor.device)
+        # Every point gets label 1 (presumably "foreground" - confirm against the predictor)
+        in_labels = torch.ones(in_points.shape[0], dtype=torch.int, device=in_points.device)
+        # Each point is its own single-point prompt (extra axis of size 1);
+        # logits are requested because the stability score below needs them.
+        masks, iou_preds, _ = self.predictor.predict_torch(
+            in_points[:, None, :],
+            in_labels[:, None],
+            multimask_output=True,
+            return_logits=True,
+        )
+        # Serialize predictions and store in MaskData
+        # The first two axes (point, mask-per-point) are merged; each point is
+        # repeated masks.shape[1] times so it stays aligned with its masks.
+        data = MaskData(
+            masks=masks.flatten(0, 1),
+            iou_preds=iou_preds.flatten(0, 1),
+            points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
+        )
+        # Drop the local reference; the flattened copy lives on in `data`
+        del masks
+        # Filter by predicted IoU
+        if self.pred_iou_thresh > 0.0:
+            keep_mask = data["iou_preds"] > self.pred_iou_thresh
+            data.filter(keep_mask)
+        # Calculate stability score
+        data["stability_score"] = calculate_stability_score(
+            data["masks"], self.predictor.model.mask_threshold, self.stability_score_offset
+        )
+        if self.stability_score_thresh > 0.0:
+            keep_mask = data["stability_score"] >= self.stability_score_thresh
+            data.filter(keep_mask)
+        # Threshold masks and calculate boxes
+        # (from here on "masks" holds booleans instead of logits)
+        data["masks"] = data["masks"] > self.predictor.model.mask_threshold
+        data["boxes"] = batched_mask_to_box(data["masks"])
+        # Filter boxes that touch crop boundaries
+        keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
+        if not torch.all(keep_mask):
+            data.filter(keep_mask)
+        # Compress to RLE
+        # Masks are first placed back into the original image frame, then the
+        # dense masks are discarded so only the RLEs are kept.
+        data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
+        data["rles"] = mask_to_rle_pytorch(data["masks"])
+        del data["masks"]
+        return data
+    @staticmethod
+    def postprocess_small_regions(
+        mask_data: MaskData, min_area: int, nms_thresh: float
+    ) -> MaskData:
+        """
+        Cleans every mask by removing holes and disconnected islands smaller
+        than min_area, then runs box NMS again because cleaned masks may have
+        become duplicates of each other.
+        mask_data is modified in place and also returned.
+        Requires open-cv as a dependency.
+        """
+        if len(mask_data["rles"]) == 0:
+            return mask_data
+        # Clean each mask and record whether it had to be touched
+        cleaned = []
+        scores = []
+        for rle in mask_data["rles"]:
+            mask, holes_changed = remove_small_regions(rle_to_mask(rle), min_area, mode="holes")
+            mask, islands_changed = remove_small_regions(mask, min_area, mode="islands")
+            cleaned.append(torch.as_tensor(mask).unsqueeze(0))
+            # Untouched masks score 1, modified masks score 0, so NMS keeps
+            # the untouched one when two boxes overlap
+            scores.append(float(not (holes_changed or islands_changed)))
+        # New boxes from the cleaned masks, then NMS with a single category
+        masks = torch.cat(cleaned, dim=0)
+        boxes = batched_mask_to_box(masks)
+        keep_by_nms = batched_nms(
+            boxes.float(),
+            torch.as_tensor(scores),
+            torch.zeros_like(boxes[:, 0]),  # categories
+            iou_threshold=nms_thresh,
+        )
+        # Re-encode only the survivors that were actually modified
+        for i_mask in keep_by_nms:
+            if scores[i_mask] != 0.0:
+                continue
+            mask_data["rles"][i_mask] = mask_to_rle_pytorch(masks[i_mask].unsqueeze(0))[0]
+            mask_data["boxes"][i_mask] = boxes[i_mask]  # update res directly
+        mask_data.filter(keep_by_nms)
+        return mask_data

model/segment_anything_volumetric/build_sam.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from functools import partial
+from pathlib import Path
+import urllib.request
+import torch
+from .modeling import (
+    ImageEncoderViT,
+    MaskDecoder,
+    PromptEncoder,
+    Sam,
+    TwoWayTransformer,
+)
+import numpy as np
+from .modeling.image_encoder_swin import SwinTransformer
+from monai.networks.nets import ViT
+from monai.networks.nets.swin_unetr import SwinTransformer as SwinViT
+from monai.utils import ensure_tuple_rep, optional_import
+"""
+Examples::
+            # for 3D single channel input with size (96,96,96), 4-channel output and feature size of 48.
+            >>> net = SwinUNETR(img_size=(96,96,96), in_channels=1, out_channels=4, feature_size=48)
+            # for 3D 4-channel input with size (128,128,128), 3-channel output and (2,4,2,2) layers in each stage.
+            >>> net = SwinUNETR(img_size=(128,128,128), in_channels=4, out_channels=3, depths=(2,4,2,2))
+            # for 2D 3-channel input with size (96,96), 2-channel output and gradient checkpointing.
+            >>> net = SwinUNETR(img_size=(96,96), in_channels=3, out_channels=2, use_checkpoint=True, spatial_dims=2)
+"""
+def build_sam_vit_3d(checkpoint=None):
+    """
+    Build the SAM model with the 3D ViT image encoder configuration
+    (embedding dim 768, patch size [4,16,16], image size [32,256,256]).
+    Args:
+        checkpoint: optional checkpoint path forwarded to `_build_sam`.
+    """
+    print('build_sam_vit_3d...')
+    vit_3d_config = dict(
+        image_encoder_type='vit',
+        embed_dim=768,
+        patch_size=[4, 16, 16],
+        image_size=[32, 256, 256],
+    )
+    return _build_sam(checkpoint=checkpoint, **vit_3d_config)
+# Maps an image-encoder type name to the function that builds the model.
+# Only the 3D ViT variant is registered.
+sam_model_registry = {
+    "vit": build_sam_vit_3d,
+}
+def _build_sam(
+    image_encoder_type,
+    embed_dim,
+    patch_size,
+    checkpoint,
+    image_size,
+):
+    """
+    Assemble a `Sam` model from a MONAI `ViT` image encoder, a `PromptEncoder`
+    and a `MaskDecoder`, and return it in eval mode.
+    Args:
+        image_encoder_type: encoder type name, forwarded to `MaskDecoder`.
+        embed_dim: hidden size of the ViT; also used as the prompt-encoder and
+            mask-decoder embedding dimension.
+        patch_size: per-axis patch size of the ViT.
+        checkpoint: optional path to a checkpoint file; when given, the
+            entries of its 'state_dict' whose key contains 'model.encoder.'
+            are loaded into the image encoder.
+        image_size: per-axis input size of the ViT.
+    """
+    # Fixed ViT hyper-parameters
+    vit_kwargs = dict(
+        in_channels=1,
+        img_size=image_size,
+        patch_size=patch_size,
+        hidden_size=embed_dim,
+        mlp_dim=3072,
+        num_layers=12,
+        num_heads=12,
+        pos_embed='perceptron',
+        classification=False,
+        dropout_rate=0.0,
+    )
+    image_encoder = ViT(**vit_kwargs)
+    # Number of patches along each axis
+    patches_per_axis = np.array(image_size) / np.array(patch_size)
+    image_embedding_size = [int(count) for count in patches_per_axis]
+    if checkpoint is not None:
+        prefix = 'model.encoder.'
+        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+        with open(checkpoint, "rb") as f:
+            state_dict = torch.load(f, map_location='cpu')['state_dict']
+        # Keep the encoder entries and strip the prefix from their keys
+        encoder_dict = {}
+        for key, value in state_dict.items():
+            if prefix in key:
+                encoder_dict[key.replace(prefix, '')] = value
+        image_encoder.load_state_dict(encoder_dict)
+        print(f'===> image_encoder.load_param: {checkpoint}')
+    prompt_encoder = PromptEncoder(
+        embed_dim=embed_dim,
+        image_embedding_size=image_embedding_size,
+        input_image_size=image_size,
+        mask_in_chans=16,
+    )
+    decoder_transformer = TwoWayTransformer(
+        depth=2,
+        embedding_dim=embed_dim,
+        mlp_dim=2048,
+        num_heads=8,
+    )
+    mask_decoder = MaskDecoder(
+        image_encoder_type=image_encoder_type,
+        num_multimask_outputs=3,
+        transformer=decoder_transformer,
+        transformer_dim=embed_dim,
+        iou_head_depth=3,
+        iou_head_hidden_dim=256,
+        image_size=np.array(image_size),
+        patch_size=np.array(patch_size),
+    )
+    sam = Sam(
+        image_encoder=image_encoder,
+        prompt_encoder=prompt_encoder,
+        mask_decoder=mask_decoder,
+        pixel_mean=[123.675, 116.28, 103.53],
+        pixel_std=[58.395, 57.12, 57.375],
+    )
+    sam.eval()
+    return sam

model/segment_anything_volumetric/modeling/.ipynb_checkpoints/image_encoder_swin-checkpoint.py ADDED Viewed

	@@ -0,0 +1,709 @@

+from typing import Sequence, Tuple, Type, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from torch.nn import LayerNorm
+from monai.networks.blocks import MLPBlock as Mlp
+from monai.networks.blocks import PatchEmbed, UnetOutBlock, UnetrBasicBlock, UnetrUpBlock
+from monai.networks.layers import DropPath, trunc_normal_
+from monai.utils import ensure_tuple_rep, optional_import
+rearrange, _ = optional_import("einops", name="rearrange")
+def window_partition(x, window_size):
+    """Split a channels-last feature map into non-overlapping local windows, based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+     Args:
+        x: input tensor of shape (b, d, h, w, c) or (b, h, w, c).
+        window_size: local window size, one entry per spatial axis.
+     Returns:
+        tensor of shape (num_windows * b, tokens_per_window, c).
+    """
+    shape = x.size()
+    rank = len(shape)
+    if rank == 5:
+        b, d, h, w, c = shape
+        wd, wh, ww = window_size[0], window_size[1], window_size[2]
+        # Split every spatial axis into (number of windows, window extent)
+        grid = x.view(b, d // wd, wd, h // wh, wh, w // ww, ww, c)
+        # Bring the window-count axes forward, then flatten each window
+        windows = grid.permute(0, 1, 3, 5, 2, 4, 6, 7).contiguous().view(-1, wd * wh * ww, c)
+    elif rank == 4:
+        b, h, w, c = shape
+        wh, ww = window_size[0], window_size[1]
+        grid = x.view(b, h // wh, wh, w // ww, ww, c)
+        windows = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, wh * ww, c)
+    return windows
+def window_reverse(windows, window_size, dims):
+    """window reverse operation (inverse of window partition) based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+     Args:
+        windows: windows tensor.
+        window_size: local window size, one entry per spatial axis.
+        dims: dimension values of the output, (b, d, h, w) or (b, h, w).
+     Returns:
+        channels-last tensor of shape (b, d, h, w, c) or (b, h, w, c).
+    """
+    if len(dims) == 4:
+        b, d, h, w = dims
+        x = windows.view(
+            b,
+            d // window_size[0],
+            h // window_size[1],
+            w // window_size[2],
+            window_size[0],
+            window_size[1],
+            window_size[2],
+            -1,
+        )
+        # Interleave each window-count axis with its window-extent axis
+        x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(b, d, h, w, -1)
+    elif len(dims) == 3:
+        b, h, w = dims
+        # The number of windows along the width is w // window_size[1]; using
+        # window_size[0] here is only correct for square windows.
+        x = windows.view(b, h // window_size[0], w // window_size[1], window_size[0], window_size[1], -1)
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
+    return x
+def get_window_size(x_size, window_size, shift_size=None):
+    """Clamp the window (and shift) size to the input size, based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    On every axis where the input is not larger than the window, the window
+    is shrunk to the input extent and the shift on that axis is set to 0.
+     Args:
+        x_size: input size.
+        window_size: local window size.
+        shift_size: window shifting size.
+     Returns:
+        the window size as a tuple, or a (window size, shift size) pair of
+        tuples when shift_size is given.
+    """
+    has_shift = shift_size is not None
+    sizes = list(window_size)
+    shifts = list(shift_size) if has_shift else None
+    for axis, extent in enumerate(x_size):
+        if extent > window_size[axis]:
+            continue
+        sizes[axis] = extent
+        if has_shift:
+            shifts[axis] = 0
+    if has_shift:
+        return tuple(sizes), tuple(shifts)
+    return tuple(sizes)
+class WindowAttention(nn.Module):
+    """
+    Window based multi-head self attention module with relative position bias based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    Supports 3D (three-entry window_size) and 2D (two-entry window_size) windows.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: Sequence[int],
+        qkv_bias: bool = False,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            dim: number of feature channels.
+            num_heads: number of attention heads.
+            window_size: local window size.
+            qkv_bias: add a learnable bias to query, key, value.
+            attn_drop: attention dropout rate.
+            proj_drop: dropout rate of output.
+        """
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # Standard 1/sqrt(head_dim) scaling applied to the queries
+        self.scale = head_dim**-0.5
+        # Used below as a feature check: when meshgrid exposes keyword-only
+        # defaults, the explicit `indexing` keyword is passed.
+        mesh_args = torch.meshgrid.__kwdefaults__
+        if len(self.window_size) == 3:
+            # One learnable bias per head for every possible relative offset:
+            # each axis has (2 * window - 1) distinct offsets.
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(
+                    (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1),
+                    num_heads,
+                )
+            )
+            coords_d = torch.arange(self.window_size[0])
+            coords_h = torch.arange(self.window_size[1])
+            coords_w = torch.arange(self.window_size[2])
+            if mesh_args is not None:
+                coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w, indexing="ij"))
+            else:
+                coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w))
+            # (3, N) coordinates of the N tokens in a window
+            coords_flatten = torch.flatten(coords, 1)
+            # Pairwise offsets between tokens, rearranged to (N, N, 3)
+            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+            relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+            # Shift the offsets so they start at 0 ...
+            relative_coords[:, :, 0] += self.window_size[0] - 1
+            relative_coords[:, :, 1] += self.window_size[1] - 1
+            relative_coords[:, :, 2] += self.window_size[2] - 1
+            # ... and weight the axes so that the sum below is a unique flat
+            # row index into the bias table
+            relative_coords[:, :, 0] *= (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1)
+            relative_coords[:, :, 1] *= 2 * self.window_size[2] - 1
+        elif len(self.window_size) == 2:
+            # Same construction as above with two spatial axes
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
+            )
+            coords_h = torch.arange(self.window_size[0])
+            coords_w = torch.arange(self.window_size[1])
+            if mesh_args is not None:
+                coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"))
+            else:
+                coords = torch.stack(torch.meshgrid(coords_h, coords_w))
+            coords_flatten = torch.flatten(coords, 1)
+            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+            relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+            relative_coords[:, :, 0] += self.window_size[0] - 1
+            relative_coords[:, :, 1] += self.window_size[1] - 1
+            relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        # NOTE(review): a window_size with any other length leaves
+        # `relative_coords` undefined, so the next line raises.
+        # (N, N) table-row index for every (query token, key token) pair
+        relative_position_index = relative_coords.sum(-1)
+        # Registered as a buffer, not a parameter
+        self.register_buffer("relative_position_index", relative_position_index)
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        trunc_normal_(self.relative_position_bias_table, std=0.02)
+        self.softmax = nn.Softmax(dim=-1)
+    def forward(self, x, mask):
+        """
+        Args:
+            x: window tokens of shape (b, n, c), where b counts windows over
+                the whole batch and n is the number of tokens per window.
+            mask: attention mask whose first dimension is the number of
+                windows per sample and which is added to the (n, n) attention
+                logits, or None for no masking.
+        Returns:
+            tensor of shape (b, n, c).
+        """
+        b, n, c = x.shape
+        # (3, b, heads, n, c // heads): query, key and value stacked on axis 0
+        qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, c // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        q = q * self.scale
+        attn = q @ k.transpose(-2, -1)
+        # Look up the bias of every token pair; the [:n, :n] slice covers
+        # windows holding fewer tokens than the configured window
+        relative_position_bias = self.relative_position_bias_table[
+            self.relative_position_index.clone()[:n, :n].reshape(-1)
+        ].reshape(n, n, -1)
+        # (heads, n, n), broadcast over all windows
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attn = attn + relative_position_bias.unsqueeze(0)
+        if mask is not None:
+            nw = mask.shape[0]
+            # Expose the per-sample window axis so the same mask is applied
+            # to every sample and head, then fold it back
+            attn = attn.view(b // nw, nw, self.num_heads, n, n) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, n, n)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+        attn = self.attn_drop(attn)
+        # Merge the heads back into the channel dimension
+        x = (attn @ v).transpose(1, 2).reshape(b, n, c)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class SwinTransformerBlock(nn.Module):
+    """
+    Swin Transformer block based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    The block applies (shifted-)window attention and an MLP, each with a
+    residual connection, to channels-last 3D (b, d, h, w, c) or 2D
+    (b, h, w, c) feature maps.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        window_size: Sequence[int],
+        shift_size: Sequence[int],
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        drop_path: float = 0.0,
+        act_layer: str = "GELU",
+        norm_layer: Type[LayerNorm] = nn.LayerNorm,  # type: ignore
+        use_checkpoint: bool = False,
+    ) -> None:
+        """
+        Args:
+            dim: number of feature channels.
+            num_heads: number of attention heads.
+            window_size: local window size.
+            shift_size: window shift size.
+            mlp_ratio: ratio of mlp hidden dim to embedding dim.
+            qkv_bias: add a learnable bias to query, key, value.
+            drop: dropout rate.
+            attn_drop: attention dropout rate.
+            drop_path: stochastic depth rate.
+            act_layer: activation layer.
+            norm_layer: normalization layer.
+            use_checkpoint: use gradient checkpointing for reduced memory usage.
+        """
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        self.use_checkpoint = use_checkpoint
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=self.window_size,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        # Stochastic depth is only instantiated when it is actually enabled
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(hidden_size=dim, mlp_dim=mlp_hidden_dim, act=act_layer, dropout_rate=drop, dropout_mode="swin")
+    def forward_part1(self, x, mask_matrix):
+        """
+        Attention half of the block: normalize, pad to a multiple of the
+        window size, optionally shift, run window attention, then undo the
+        shift and the padding.
+        Args:
+            x: channels-last input, (b, d, h, w, c) or (b, h, w, c).
+            mask_matrix: attention mask used when the window is shifted.
+        Returns:
+            tensor with the same spatial size as x.
+        """
+        x_shape = x.size()
+        x = self.norm1(x)
+        if len(x_shape) == 5:
+            b, d, h, w, c = x.shape
+            window_size, shift_size = get_window_size((d, h, w), self.window_size, self.shift_size)
+            # Pad only at the far end of each axis, up to the next multiple
+            # of the window size
+            pad_l = pad_t = pad_d0 = 0
+            pad_d1 = (window_size[0] - d % window_size[0]) % window_size[0]
+            pad_b = (window_size[1] - h % window_size[1]) % window_size[1]
+            pad_r = (window_size[2] - w % window_size[2]) % window_size[2]
+            # F.pad lists pads from the last axis backwards: c, w, h, d
+            x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1))
+            _, dp, hp, wp, _ = x.shape
+            dims = [b, dp, hp, wp]
+        elif len(x_shape) == 4:
+            b, h, w, c = x.shape
+            window_size, shift_size = get_window_size((h, w), self.window_size, self.shift_size)
+            pad_l = pad_t = 0
+            # pad_b pads the height (window_size[0]) and pad_r the width
+            # (window_size[1]), matching the F.pad argument order c, w, h
+            pad_b = (window_size[0] - h % window_size[0]) % window_size[0]
+            pad_r = (window_size[1] - w % window_size[1]) % window_size[1]
+            x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+            _, hp, wp, _ = x.shape
+            dims = [b, hp, wp]
+        if any(i > 0 for i in shift_size):
+            # Cyclic shift; the mask keeps wrapped-around regions apart
+            if len(x_shape) == 5:
+                shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), dims=(1, 2, 3))
+            elif len(x_shape) == 4:
+                shifted_x = torch.roll(x, shifts=(-shift_size[0], -shift_size[1]), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+        x_windows = window_partition(shifted_x, window_size)
+        attn_windows = self.attn(x_windows, mask=attn_mask)
+        attn_windows = attn_windows.view(-1, *(window_size + (c,)))
+        shifted_x = window_reverse(attn_windows, window_size, dims)
+        if any(i > 0 for i in shift_size):
+            # Undo the cyclic shift
+            if len(x_shape) == 5:
+                x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1], shift_size[2]), dims=(1, 2, 3))
+            elif len(x_shape) == 4:
+                x = torch.roll(shifted_x, shifts=(shift_size[0], shift_size[1]), dims=(1, 2))
+        else:
+            x = shifted_x
+        # Crop the padding away again
+        if len(x_shape) == 5:
+            if pad_d1 > 0 or pad_r > 0 or pad_b > 0:
+                x = x[:, :d, :h, :w, :].contiguous()
+        elif len(x_shape) == 4:
+            if pad_r > 0 or pad_b > 0:
+                x = x[:, :h, :w, :].contiguous()
+        return x
+    def forward_part2(self, x):
+        """MLP half of the block: norm, MLP, then drop-path (the residual is added by the caller)."""
+        return self.drop_path(self.mlp(self.norm2(x)))
+    def load_from(self, weights, n_block, layer):
+        """
+        Copy this block's parameters from a checkpoint dictionary.
+        Args:
+            weights: dictionary with a "state_dict" entry whose keys start
+                with "module.{layer}.0.blocks.{n_block}.".
+            n_block: index of this block inside its layer.
+            layer: name of the layer this block belongs to.
+        """
+        root = f"module.{layer}.0.blocks.{n_block}."
+        # The checkpoint names the MLP layers fc1/fc2; they are copied into
+        # this module's mlp.linear1/linear2 below.
+        block_names = [
+            "norm1.weight",
+            "norm1.bias",
+            "attn.relative_position_bias_table",
+            "attn.relative_position_index",
+            "attn.qkv.weight",
+            "attn.qkv.bias",
+            "attn.proj.weight",
+            "attn.proj.bias",
+            "norm2.weight",
+            "norm2.bias",
+            "mlp.fc1.weight",
+            "mlp.fc1.bias",
+            "mlp.fc2.weight",
+            "mlp.fc2.bias",
+        ]
+        with torch.no_grad():
+            self.norm1.weight.copy_(weights["state_dict"][root + block_names[0]])
+            self.norm1.bias.copy_(weights["state_dict"][root + block_names[1]])
+            self.attn.relative_position_bias_table.copy_(weights["state_dict"][root + block_names[2]])
+            self.attn.relative_position_index.copy_(weights["state_dict"][root + block_names[3]])
+            self.attn.qkv.weight.copy_(weights["state_dict"][root + block_names[4]])
+            self.attn.qkv.bias.copy_(weights["state_dict"][root + block_names[5]])
+            self.attn.proj.weight.copy_(weights["state_dict"][root + block_names[6]])
+            self.attn.proj.bias.copy_(weights["state_dict"][root + block_names[7]])
+            self.norm2.weight.copy_(weights["state_dict"][root + block_names[8]])
+            self.norm2.bias.copy_(weights["state_dict"][root + block_names[9]])
+            self.mlp.linear1.weight.copy_(weights["state_dict"][root + block_names[10]])
+            self.mlp.linear1.bias.copy_(weights["state_dict"][root + block_names[11]])
+            self.mlp.linear2.weight.copy_(weights["state_dict"][root + block_names[12]])
+            self.mlp.linear2.bias.copy_(weights["state_dict"][root + block_names[13]])
+    def forward(self, x, mask_matrix):
+        """
+        Run the attention and MLP halves with residual connections, using
+        gradient checkpointing for each half when use_checkpoint is set.
+        """
+        shortcut = x
+        if self.use_checkpoint:
+            x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix)
+        else:
+            x = self.forward_part1(x, mask_matrix)
+        x = shortcut + self.drop_path(x)
+        if self.use_checkpoint:
+            x = x + checkpoint.checkpoint(self.forward_part2, x)
+        else:
+            x = x + self.forward_part2(x)
+        return x
+class PatchMerging(nn.Module):
+    """
+    Patch merging layer based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    Halves every spatial axis of a channels-last feature map by
+    concatenating strided sub-grids along the channel axis, then normalizes
+    and linearly projects the result to 2 * dim channels.
+    """
+    def __init__(
+        self, dim: int, norm_layer: Type[LayerNorm] = nn.LayerNorm, spatial_dims: int = 3
+    ) -> None:  # type: ignore
+        """
+        Args:
+            dim: number of feature channels.
+            norm_layer: normalization layer.
+            spatial_dims: number of spatial dims.
+        """
+        super().__init__()
+        self.dim = dim
+        if spatial_dims == 3:
+            self.reduction = nn.Linear(8 * dim, 2 * dim, bias=False)
+            self.norm = norm_layer(8 * dim)
+        elif spatial_dims == 2:
+            self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+            self.norm = norm_layer(4 * dim)
+    def forward(self, x):
+        """
+        Args:
+            x: channels-last input, (b, d, h, w, c) or (b, h, w, c).
+        Returns:
+            tensor with every spatial axis halved (odd axes are zero-padded
+            by one first) and 2 * dim channels.
+        """
+        x_shape = x.size()
+        if len(x_shape) == 5:
+            b, d, h, w, c = x_shape
+            pad_input = (h % 2 == 1) or (w % 2 == 1) or (d % 2 == 1)
+            if pad_input:
+                # F.pad lists pads from the last axis backwards (c, w, h, d),
+                # so each odd axis receives its own one-element pad
+                x = F.pad(x, (0, 0, 0, w % 2, 0, h % 2, 0, d % 2))
+            # NOTE(review): x5 repeats x2 and x6 repeats x3, so only six of
+            # the eight octants are distinct. Left unchanged on purpose:
+            # altering the slices would change what weights trained with this
+            # layout compute - confirm before "fixing".
+            x0 = x[:, 0::2, 0::2, 0::2, :]
+            x1 = x[:, 1::2, 0::2, 0::2, :]
+            x2 = x[:, 0::2, 1::2, 0::2, :]
+            x3 = x[:, 0::2, 0::2, 1::2, :]
+            x4 = x[:, 1::2, 0::2, 1::2, :]
+            x5 = x[:, 0::2, 1::2, 0::2, :]
+            x6 = x[:, 0::2, 0::2, 1::2, :]
+            x7 = x[:, 1::2, 1::2, 1::2, :]
+            x = torch.cat([x0, x1, x2, x3, x4, x5, x6, x7], -1)
+        elif len(x_shape) == 4:
+            b, h, w, c = x_shape
+            pad_input = (h % 2 == 1) or (w % 2 == 1)
+            if pad_input:
+                x = F.pad(x, (0, 0, 0, w % 2, 0, h % 2))
+            x0 = x[:, 0::2, 0::2, :]
+            x1 = x[:, 1::2, 0::2, :]
+            x2 = x[:, 0::2, 1::2, :]
+            x3 = x[:, 1::2, 1::2, :]
+            x = torch.cat([x0, x1, x2, x3], -1)
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+def compute_mask(dims, window_size, shift_size, device):
+    """Build the attention mask for shifted windows, based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    Every axis is cut into three bands; each combination of bands gets its
+    own region id. Token pairs inside a window that carry different region
+    ids are masked with -100, pairs from the same region with 0.
+     Args:
+        dims: dimension values, (d, h, w) or (h, w).
+        window_size: local window size.
+        shift_size: shift size.
+        device: device.
+    """
+    def bands(axis):
+        # Three bands along one axis: everything before the last window, the
+        # last window minus the shifted part, and the shifted part
+        win, shift = window_size[axis], shift_size[axis]
+        return slice(-win), slice(-win, -shift), slice(-shift, None)
+    region_id = 0
+    n_dims = len(dims)
+    if n_dims == 3:
+        depth, height, width = dims
+        img_mask = torch.zeros((1, depth, height, width, 1), device=device)
+        for d_band in bands(0):
+            for h_band in bands(1):
+                for w_band in bands(2):
+                    img_mask[:, d_band, h_band, w_band, :] = region_id
+                    region_id += 1
+    elif n_dims == 2:
+        height, width = dims
+        img_mask = torch.zeros((1, height, width, 1), device=device)
+        for h_band in bands(0):
+            for w_band in bands(1):
+                img_mask[:, h_band, w_band, :] = region_id
+                region_id += 1
+    # Region id of every token, grouped by window
+    mask_windows = window_partition(img_mask, window_size).squeeze(-1)
+    # Pairwise difference of region ids inside each window
+    pairwise = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+    differs = pairwise != 0
+    same = pairwise == 0
+    return pairwise.masked_fill(differs, float(-100.0)).masked_fill(same, float(0.0))
+class BasicLayer(nn.Module):
+    """
+    Basic Swin Transformer layer in one stage based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    """
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        num_heads: int,
+        window_size: Sequence[int],
+        drop_path: list,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        norm_layer: Type[LayerNorm] = nn.LayerNorm,  # type: ignore
+        downsample: isinstance = None,  # type: ignore
+        use_checkpoint: bool = False,
+    ) -> None:
+        """
+        Args:
+            dim: number of feature channels.
+            depths: number of layers in each stage.
+            num_heads: number of attention heads.
+            window_size: local window size.
+            drop_path: stochastic depth rate.
+            mlp_ratio: ratio of mlp hidden dim to embedding dim.
+            qkv_bias: add a learnable bias to query, key, value.
+            drop: dropout rate.
+            attn_drop: attention dropout rate.
+            norm_layer: normalization layer.
+            downsample: downsample layer at the end of the layer.
+            use_checkpoint: use gradient checkpointing for reduced memory usage.
+        """
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = tuple(i // 2 for i in window_size)
+        self.no_shift = tuple(0 for i in window_size)
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        self.blocks = nn.ModuleList(
+            [
+                SwinTransformerBlock(
+                    dim=dim,
+                    num_heads=num_heads,
+                    window_size=self.window_size,
+                    shift_size=self.no_shift if (i % 2 == 0) else self.shift_size,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    drop=drop,
+                    attn_drop=attn_drop,
+                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                    norm_layer=norm_layer,
+                    use_checkpoint=use_checkpoint,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.downsample = downsample
+        if self.downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer, spatial_dims=len(self.window_size))
+    def forward(self, x):
+        x_shape = x.size()
+        if len(x_shape) == 5:
+            b, c, d, h, w = x_shape
+            window_size, shift_size = get_window_size((d, h, w), self.window_size, self.shift_size)
+            x = rearrange(x, "b c d h w -> b d h w c")
+            dp = int(np.ceil(d / window_size[0])) * window_size[0]
+            hp = int(np.ceil(h / window_size[1])) * window_size[1]
+            wp = int(np.ceil(w / window_size[2])) * window_size[2]
+            attn_mask = compute_mask([dp, hp, wp], window_size, shift_size, x.device)
+            for blk in self.blocks:
+                x = blk(x, attn_mask)
+            x = x.view(b, d, h, w, -1)
+            if self.downsample is not None:
+                x = self.downsample(x)
+            x = rearrange(x, "b d h w c -> b c d h w")
+        elif len(x_shape) == 4:
+            b, c, h, w = x_shape
+            window_size, shift_size = get_window_size((h, w), self.window_size, self.shift_size)
+            x = rearrange(x, "b c h w -> b h w c")
+            hp = int(np.ceil(h / window_size[0])) * window_size[0]
+            wp = int(np.ceil(w / window_size[1])) * window_size[1]
+            attn_mask = compute_mask([hp, wp], window_size, shift_size, x.device)
+            for blk in self.blocks:
+                x = blk(x, attn_mask)
+            x = x.view(b, h, w, -1)
+            if self.downsample is not None:
+                x = self.downsample(x)
+            x = rearrange(x, "b h w c -> b c h w")
+        return x
+class SwinTransformer(nn.Module):
+    """
+    Swin Transformer based on: "Liu et al.,
+    Swin Transformer: Hierarchical Vision Transformer using Shifted Windows
+    <https://arxiv.org/abs/2103.14030>"
+    https://github.com/microsoft/Swin-Transformer
+    """
+    def __init__(
+        self,
+        in_chans: int,
+        embed_dim: int,
+        window_size: Sequence[int],
+        patch_size: Sequence[int],
+        depths: Sequence[int],
+        num_heads: Sequence[int],
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        norm_layer: Type[LayerNorm] = nn.LayerNorm,  # type: ignore
+        patch_norm: bool = False,
+        use_checkpoint: bool = False,
+        spatial_dims: int = 3,
+    ) -> None:
+        """
+        Args:
+            in_chans: dimension of input channels.
+            embed_dim: number of linear projection output channels.
+            window_size: local window size.
+            patch_size: patch size.
+            depths: number of layers in each stage.
+            num_heads: number of attention heads.
+            mlp_ratio: ratio of mlp hidden dim to embedding dim.
+            qkv_bias: add a learnable bias to query, key, value.
+            drop_rate: dropout rate.
+            attn_drop_rate: attention dropout rate.
+            drop_path_rate: stochastic depth rate.
+            norm_layer: normalization layer.
+            patch_norm: add normalization after patch embedding.
+            use_checkpoint: use gradient checkpointing for reduced memory usage.
+            spatial_dims: spatial dimension.
+        """
+        super().__init__()
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.window_size = window_size
+        self.patch_size = patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=self.patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None,  # type: ignore
+            spatial_dims=spatial_dims,
+        )
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        # self.layers1 = nn.ModuleList()
+        # self.layers2 = nn.ModuleList()
+        # self.layers3 = nn.ModuleList()
+        # self.layers4 = nn.ModuleList()
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=self.window_size,
+                drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                norm_layer=norm_layer,
+                downsample=PatchMerging,
+                use_checkpoint=use_checkpoint,
+            )
+            self.layers.append(layer)
+            # if i_layer == 0:
+            #     self.layers1.append(layer)
+            # elif i_layer == 1:
+            #     self.layers2.append(layer)
+            # elif i_layer == 2:
+            #     self.layers3.append(layer)
+            # elif i_layer == 3:
+            #     self.layers4.append(layer)
+        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
+    def proj_out(self, x, normalize=False):
+        if normalize:
+            x_shape = x.size()
+            if len(x_shape) == 5:
+                n, ch, d, h, w = x_shape
+                x = rearrange(x, "n c d h w -> n d h w c")
+                x = F.layer_norm(x, [ch])
+                x = rearrange(x, "n d h w c -> n c d h w")
+            elif len(x_shape) == 4:
+                n, ch, h, w = x_shape
+                x = rearrange(x, "n c h w -> n h w c")
+                x = F.layer_norm(x, [ch])
+                x = rearrange(x, "n h w c -> n c h w")
+        return x
+    def forward(self, x, normalize=True):
+        # x input: [B*sample, C(1), H, W, D]
+        # x = rearrange(x, "b c h w d -> b c d h w")
+        # print('>> input: ', x.shape)
+        x = self.patch_embed(x)
+        # print('>> patch_embed: ', x.shape)
+        x = self.pos_drop(x)
+        for layer in self.layers:
+            x = layer(x.contiguous())
+            # print('>> layer: ', x.shape)
+        return x
+        # # x0_out = self.proj_out(x0, normalize)
+        # x1 = self.layers1[0](x0.contiguous())
+        # # x1_out = self.proj_out(x1, normalize)
+        # x2 = self.layers2[0](x1.contiguous())
+        # # x2_out = self.proj_out(x2, normalize)
+        # x3 = self.layers3[0](x2.contiguous())
+        # # x3_out = self.proj_out(x3, normalize)
+        # x4 = self.layers4[0](x3.contiguous())
+        # # x4_out = self.proj_out(x4, normalize)
+        # # return [x0_out, x1_out, x2_out, x3_out, x4_out]

model/segment_anything_volumetric/modeling/.ipynb_checkpoints/prompt_encoder-checkpoint.py ADDED Viewed

	@@ -0,0 +1,232 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import numpy as np
+import torch
+from torch import nn
+from typing import Any, Optional, Tuple, Type
+from .common import LayerNorm2d
+import os
+class PromptEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        image_embedding_size: Tuple[int, int, int],
+        input_image_size: Tuple[int, int, int],
+        mask_in_chans: int,
+        activation: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        """
+        Encodes prompts for input to SAM's mask decoder.
+        Arguments:
+          embed_dim (int): The prompts' embedding dimension
+          image_embedding_size (tuple(int, int)): The spatial size of the
+            image embedding, as (H, W).
+          input_image_size (int): The padded size of the image as input
+            to the image encoder, as (H, W).
+          mask_in_chans (int): The number of hidden channels used for
+            encoding input masks.
+          activation (nn.Module): The activation to use when encoding
+            input masks.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.input_image_size = input_image_size
+        self.image_embedding_size = image_embedding_size
+        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
+        point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)]
+        self.point_embeddings = nn.ModuleList(point_embeddings)
+        self.not_a_point_embed = nn.Embedding(1, embed_dim)
+        self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1], 4 * image_embedding_size[2])
+        self.mask_downscaling = nn.Sequential(
+            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans // 4),
+            activation(),
+            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans),
+            activation(),
+            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+        )
+        self.no_mask_embed = nn.Embedding(1, embed_dim)
+    def get_dense_pe(self) -> torch.Tensor:
+        """
+        Returns the positional encoding used to encode point prompts,
+        applied to a dense set of points the shape of the image encoding.
+        Returns:
+          torch.Tensor: Positional encoding with shape
+            1x(embed_dim)x(embedding_h)x(embedding_w)
+        """
+        return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+    def _embed_points(
+        self,
+        points: torch.Tensor,
+        labels: torch.Tensor,
+        pad: bool,
+    ) -> torch.Tensor:
+        """Embeds point prompts."""
+        points = points + 0.5  # Shift to center of pixel
+        if pad:
+            padding_point = torch.zeros((points.shape[0], 1, 3), device=points.device)
+            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+            points = torch.cat([points, padding_point], dim=1)
+            labels = torch.cat([labels, padding_label], dim=1)
+        point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size)
+        point_embedding[labels == -1] = 0.0
+        point_embedding[labels == -1] += self.not_a_point_embed.weight
+        point_embedding[labels == 0] += self.point_embeddings[0].weight
+        point_embedding[labels == 1] += self.point_embeddings[1].weight
+        return point_embedding
+    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+        """Embeds box prompts."""
+        boxes = boxes + 0.5  # Shift to center of pixel
+        coords = boxes.reshape(-1, 2, 3)
+        corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size)
+        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+        return corner_embedding
+    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+        """Embeds mask inputs."""
+        mask_embedding = self.mask_downscaling(masks)
+        return mask_embedding
+    def _get_batch_size(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+        text_embedding: Optional[torch.Tensor],
+    ) -> int:
+        """
+        Gets the batch size of the output given the batch size of the input prompts.
+        """
+        if points is not None:
+            return points[0].shape[0]
+        elif boxes is not None:
+            return boxes.shape[0]
+        elif masks is not None:
+            return masks.shape[0]
+        elif text_embedding is not None:
+            return text_embedding.shape[0]
+        else:
+            return 1
+    def _get_device(self) -> torch.device:
+        return self.point_embeddings[0].weight.device
+    def forward(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+        text_embedding: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Embeds different types of prompts, returning both sparse and dense
+        embeddings.
+        Arguments:
+          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+            and labels to embed.
+          boxes (torch.Tensor or none): boxes to embed
+          masks (torch.Tensor or none): masks to embed
+          text: test prompt (B, 768)
+        Returns:
+          torch.Tensor: sparse embeddings for the points and boxes, with shape
+            BxNx(embed_dim), where N is determined by the number of input points
+            and boxes.
+          torch.Tensor: dense embeddings for the masks, in the shape
+            Bx(embed_dim)x(embed_H)x(embed_W)
+        """
+        # print('prompt encoder here...')
+        bs = self._get_batch_size(points, boxes, masks, text_embedding)
+        sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
+        # print('sparse_embeddings ', sparse_embeddings.shape)
+        if points is not None:
+            coords, labels = points
+            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+        if boxes is not None:
+            box_embeddings = self._embed_boxes(boxes)
+            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+        if text_embedding is not None:
+            sparse_embeddings = torch.cat([sparse_embeddings, text_embedding.unsqueeze(dim=1)], dim=1)
+        # print('box_embeddings ', box_embeddings.shape)
+        # print('sparse_embeddings after box/point/text', sparse_embeddings.shape)
+        if masks is not None:
+            dense_embeddings = self._embed_masks(masks)
+        else:
+            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1, 1).expand(
+                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1], self.image_embedding_size[2]
+            )
+        # print('dense_embeddings ', dense_embeddings.shape)
+        return sparse_embeddings, dense_embeddings
+class PositionEmbeddingRandom(nn.Module):
+    """
+    Positional encoding using random spatial frequencies.
+    """
+    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+        super().__init__()
+        if scale is None or scale <= 0.0:
+            scale = 1.0
+        self.register_buffer(
+            "positional_encoding_gaussian_matrix",
+            scale * torch.randn((3, num_pos_feats)),
+        )
+    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+        """Positionally encode points that are normalized to [0,1]."""
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coords = 2 * coords - 1
+        coords = coords @ self.positional_encoding_gaussian_matrix
+        coords = 2 * np.pi * coords
+        # outputs d_1 x ... x d_n x C shape
+        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+    def forward(self, size: Tuple[int, int, int]) -> torch.Tensor:
+        """Generate positional encoding for a grid of the specified size."""
+        h, w, d = size
+        device: Any = self.positional_encoding_gaussian_matrix.device
+        grid = torch.ones((h, w, d), device=device, dtype=torch.float32)
+        y_embed = grid.cumsum(dim=0) - 0.5
+        x_embed = grid.cumsum(dim=1) - 0.5
+        z_embed = grid.cumsum(dim=2) - 0.5
+        y_embed = y_embed / h
+        x_embed = x_embed / w
+        z_embed = z_embed / d
+        pe = self._pe_encoding(torch.stack([x_embed, y_embed, z_embed], dim=-1))
+        return pe.permute(3, 0, 1, 2)  # C x H x W x D
+    def forward_with_coords(
+        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+    ) -> torch.Tensor:
+        """Positionally encode points that are not normalized to [0,1]."""
+        coords = coords_input.clone()
+        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+        coords[:, :, 2] = coords[:, :, 2] / image_size[2]
+        return self._pe_encoding(coords.to(torch.float))  # B x N x C

model/segment_anything_volumetric/modeling/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from .sam import Sam
+from .image_encoder import ImageEncoderViT
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+from .transformer import TwoWayTransformer

model/segment_anything_volumetric/modeling/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (394 Bytes). View file

model/segment_anything_volumetric/modeling/__pycache__/__init__.cpython-39.pyc ADDED Viewed

Binary file (424 Bytes). View file

model/segment_anything_volumetric/modeling/__pycache__/common.cpython-310.pyc ADDED Viewed

Binary file (1.75 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/common.cpython-39.pyc ADDED Viewed

Binary file (1.77 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/image_encoder.cpython-310.pyc ADDED Viewed

Binary file (12.6 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/image_encoder.cpython-39.pyc ADDED Viewed

Binary file (11.4 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/image_encoder_swin.cpython-39.pyc ADDED Viewed

Binary file (21.5 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/mask_decoder.cpython-310.pyc ADDED Viewed

Binary file (5.5 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/mask_decoder.cpython-39.pyc ADDED Viewed

Binary file (6.09 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/prompt_encoder.cpython-310.pyc ADDED Viewed

Binary file (7.68 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/prompt_encoder.cpython-39.pyc ADDED Viewed

Binary file (8.01 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/sam.cpython-310.pyc ADDED Viewed

Binary file (6.66 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/sam.cpython-39.pyc ADDED Viewed

Binary file (6.67 kB). View file

model/segment_anything_volumetric/modeling/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (6.6 kB). View file