Jingkang Yang committed on
Commit
ad8dd60
1 Parent(s): a88b3fb

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +3 -0
  3. CODE_OF_CONDUCT.md +80 -0
  4. CONTRIBUTING.md +32 -0
  5. GETTING_STARTED.md +99 -0
  6. INSTALL.md +50 -0
  7. LICENSE +399 -0
  8. UI/sailvos3d/ex1/inputs/depth_000160.npy +3 -0
  9. UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz +3 -0
  10. UI/sailvos3d/ex1/inputs/rgb_000160.bmp +3 -0
  11. UI/sailvos3d/ex2/inputs/depth_000540.npy +3 -0
  12. UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz +3 -0
  13. UI/sailvos3d/ex2/inputs/rgb_000540.bmp +3 -0
  14. __pycache__/ui.cpython-39.pyc +0 -0
  15. app.py +194 -0
  16. configs/ovseg_swinB_vitL_bs32_120k.yaml +100 -0
  17. configs/ovseg_swinB_vitL_demo.yaml +99 -0
  18. datasets/DATASETS.md +122 -0
  19. datasets/prepare_ade20k_full_sem_seg.py +1011 -0
  20. datasets/prepare_ade20k_sem_seg.py +35 -0
  21. datasets/prepare_coco_stuff_sem_seg.py +219 -0
  22. datasets/prepare_pascal_context.py +69 -0
  23. datasets/prepare_voc_sem_seg.py +71 -0
  24. demo.py +123 -0
  25. flagged/log.csv +3 -0
  26. flagged/output/tmpii192qpn.png +0 -0
  27. flagged/output/tmpqm122tsi.png +0 -0
  28. open_vocab_seg/__init__.py +9 -0
  29. open_vocab_seg/__pycache__/__init__.cpython-39.pyc +0 -0
  30. open_vocab_seg/__pycache__/config.cpython-39.pyc +0 -0
  31. open_vocab_seg/__pycache__/mask_former_model.cpython-39.pyc +0 -0
  32. open_vocab_seg/__pycache__/ovseg_model.cpython-39.pyc +0 -0
  33. open_vocab_seg/__pycache__/test_time_augmentation.cpython-39.pyc +0 -0
  34. open_vocab_seg/config.py +133 -0
  35. open_vocab_seg/data/__init__.py +9 -0
  36. open_vocab_seg/data/__pycache__/__init__.cpython-39.pyc +0 -0
  37. open_vocab_seg/data/__pycache__/build.cpython-39.pyc +0 -0
  38. open_vocab_seg/data/augmentations.py +202 -0
  39. open_vocab_seg/data/build.py +344 -0
  40. open_vocab_seg/data/dataset_mappers/__init__.py +4 -0
  41. open_vocab_seg/data/dataset_mappers/__pycache__/__init__.cpython-39.pyc +0 -0
  42. open_vocab_seg/data/dataset_mappers/__pycache__/mask_former_semantic_dataset_mapper.cpython-39.pyc +0 -0
  43. open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py +208 -0
  44. open_vocab_seg/data/datasets/__init__.py +5 -0
  45. open_vocab_seg/data/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
  46. open_vocab_seg/data/datasets/__pycache__/register_ade20k_full.cpython-39.pyc +0 -0
  47. open_vocab_seg/data/datasets/__pycache__/register_cc3m.cpython-39.pyc +0 -0
  48. open_vocab_seg/data/datasets/__pycache__/register_coco_stuff.cpython-39.pyc +0 -0
  49. open_vocab_seg/data/datasets/__pycache__/register_pascal_context.cpython-39.pyc +0 -0
  50. open_vocab_seg/data/datasets/__pycache__/register_voc_seg.cpython-39.pyc +0 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.bmp filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pth
2
+ *.zip
3
+ outputs/*
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to OVSeg
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Meta's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+
30
+ ## License
31
+ By contributing to OVSeg, you agree that your contributions will be licensed
32
+ under the LICENSE file in the root directory of this source tree.
GETTING_STARTED.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Getting started with OVSeg
2
+
3
+
4
+ ### Try demo
5
+
6
+ We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: <tt>526080</tt>).
7
+
8
+ - Test on sample image
9
+ ```bash
10
+ python demo.py --config-file configs/ovseg_swinB_vitL_demo.yaml --class-names 'Oculus' 'Ukulele' --input ./resources/demo_samples/sample_03.jpeg --output ./pred --opts MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth
11
+ ```
12
+
13
+ ### Evaluation with pre-trained weights
14
+
15
+ We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: <tt>526080</tt>).
16
+
17
+ - Test on ADE20K-150 and ADE-847
18
+ ```bash
19
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\)
20
+ ```
21
+
22
+ - Test on PascalContext-59 and PascalContext-459
23
+ ```bash
24
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.6 DATASETS.TEST \(\"pascal_context_59_sem_seg_val\",\"pascal_context_459_sem_seg_val\",\)
25
+ ```
26
+
27
+ - Test on PascalVOC-20
28
+ ```bash
29
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.45 DATASETS.TEST \(\"pascalvoc20_sem_seg_val\",\)
30
+ ```
31
+
32
+ #### Performance benchmark
33
+
34
+ | method | backbone | training dataset | A-847 | PC-459 | A-150 | PC-59 | PAS-20 |
35
+ |------------------------------------|----------|------------------|:-----:|:------:|:-----:|:-----:|:------:|
36
+ | Open-vocabulary generalist models. | | | | | | | |
37
+ | SPNet | R-101 | PASCAL-15 | - | - | - | 24.3 | 18.3 |
38
+ | ZS3Net | R-101 | PASCAL-15 | - | - | - | 19.4 | 38.3 |
39
+ | LSeg | R-101 | PASCAL-15 | - | - | - | - | 47.4 |
40
+ | LSeg+ | R-101 | COCO Panoptic | 2.5 | 5.2 | 13.0 | 36.0 | 59.0 |
41
+ | SimBaseline | R-101c | COCO-Stuff-156 | - | - | 15.3 | - | 74.5 |
42
+ | ZegFormer | R-50 | COCO-Stuff-156 | - | - | 16.4 | - | 80.7 |
43
+ | OpenSeg | R-101 | COCO Panoptic | 4.0 | 6.5 | 15.3 | 36.9 | 60.0 |
44
+ | OVSeg (Ours) | R-101c | COCO-Stuff-171 | 7.1 | 11.0 | 24.8 | 53.3 | 92.6 |
45
+ | LSeg+ | Eff-B7 | COCO Panoptic | 3.8 | 7.8 | 18.0 | 46.5 | - |
46
+ | OpenSeg | Eff-B7 | COCO Panoptic | 6.3 | 9.0 | 21.1 | 42.1 | - |
47
+ | OVSeg (Ours) | Swin-B | COCO-Stuff-171 | 9.0 | 12.4 | 29.6 | 55.7 | 94.5 |
48
+ | Supervised specialist models. | | | | | | | |
49
+ | FCN | FCN-8s | Same as test | - | - | 29.4 | 37.8 | - |
50
+ | Deeplab | R-101 | Same as test | - | - | - | 45.7 | 77.7 |
51
+ | SelfTrain | Eff-L2 | Same as test | - | - | - | - | 90.0 |
52
+
53
+ #### Ablation study
54
+
55
+ - Mask prompt tuning can bring significant improvement without changing CLIP weights (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf))
56
+
57
+ Download the checkpoint with mpt only [ovseg_swinbase_vitL14_mpt_only.pt](https://drive.google.com/file/d/1LJGWFjHw76OGDNy9r9KQIaACfIm9KMhQ/view?usp=sharing) (md5: <tt>2dd495</tt>).
58
+
59
+ ```bash
60
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_mpt_only.pt DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\)
61
+ ```
62
+
63
+ - Mask prompt tuning can improve over fully finetuned model (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf))
64
+
65
+ With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `MASK_PROMPT_FWD` as `False`
66
+
67
+ ```bash
68
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\)
69
+ ```
70
+
71
+ - The effects of class prediction ensemble (Table 6 in [paper](https://arxiv.org/pdf/2210.04150.pdf))
72
+
73
+ With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `CLIP_ENSEMBLE` as `False`.
74
+
75
+ ```bash
76
+ python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\)
77
+ ```
78
+
79
+ ### Training Segmentation model
80
+
81
+ Our model is trained on COCO-Stuff
82
+
83
+ - Training baseline w/ original CLIP
84
+ ```
85
+ python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False
86
+ ```
87
+
88
+ To reproduce our final results, you may want to use our mask-adapted CLIP
89
+
90
+ - Training ovseg w/ mask-adapted CLIP
91
+ ```
92
+ python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME #PATH_TO_MASKADAPTED_CLIP
93
+ ```
94
+
95
+ CAUTION: The final results are sensitive to the ensemble (appendix A.5 in [paper](https://arxiv.org/pdf/2210.04150.pdf)). Thus, you may want to use the ```tools/search_thr_ensemble_w.sh``` to find the best ensemble hyper-parameters.
96
+
97
+ ### Fine-tuning CLIP with collected mask-category pairs
98
+
99
+ We are still working on this part, stay tuned!
INSTALL.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Installation
2
+
3
+ ### Requirements
4
+ - Linux with Python ≥ 3.8
5
+ - PyTorch ≥ 1.8 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
6
+ Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check
7
+ that the PyTorch version matches the one required by Detectron2.
8
+ - PyTorch3d: follow [Pytorch3d installation instructions](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md).
9
+ - Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
10
+ - Segment Anything Model: follow [SAM](https://github.com/facebookresearch/segment-anything).
11
+
12
+ ### Usage
13
+
14
+ Install required packages.
15
+
16
+ ```bash
17
+ conda create --name ovseg python=3.8
18
+ conda activate ovseg
19
+ conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge
20
+ conda install -c fvcore -c iopath -c conda-forge fvcore iopath
21
+ conda install pytorch3d -c pytorch3d
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ You need to download `detectron2==0.6` following [instructions](https://detectron2.readthedocs.io/en/latest/tutorials/install.html)
26
+
27
+ ```bash
28
+ python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
29
+ ```
30
+
31
+ If you cannot successfully install `pycocotools`, try this from [here](https://github.com/cocodataset/cocoapi/issues/351):
32
+ ```bash
33
+ conda install -c conda-forge pycocotools
34
+ ```
35
+
36
+ Install the SAM with:
37
+ ```bash
38
+ pip install git+https://github.com/facebookresearch/segment-anything.git
39
+ ```
40
+ To fully support the SAM, install these packages:
41
+ ```bash
42
+ pip install opencv-python pycocotools matplotlib onnxruntime onnx
43
+ ```
44
+
45
+ Furthermore, install the modified CLIP package.
46
+
47
+ ```bash
48
+ cd third_party/CLIP
49
+ python -m pip install -Ue .
50
+ ```
LICENSE ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
UI/sailvos3d/ex1/inputs/depth_000160.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96d4969b8b33250785d1996b1536bb9026536f420391c68255e326990138598e
3
+ size 4096128
UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5afc2fdf1faa9b7b5d7808bb703c82aa5ccbb3154e2f62b3cc4989a2dcc92fe5
3
+ size 1234
UI/sailvos3d/ex1/inputs/rgb_000160.bmp ADDED

Git LFS Details

  • SHA256: 1c461e0c0cf6049bd9984ccaedb8b8fb07a1df06462931d38fdcd952bb38805c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.07 MB
UI/sailvos3d/ex2/inputs/depth_000540.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f24fcabda3f7fd17856c4105279f2842b631cc18579d273b87dd8f2cb39e7df6
3
+ size 4096128
UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb66f05ce4cdb6d6410bd3e34b70eeb07724810e70786249c30de0f50404fd64
3
+ size 1234
UI/sailvos3d/ex2/inputs/rgb_000540.bmp ADDED

Git LFS Details

  • SHA256: 1aa08869030d51751983bdab733f4f26342dc239abedb3195d3f4771d93701cf
  • Pointer size: 132 Bytes
  • Size of remote file: 3.07 MB
__pycache__/ui.cpython-39.pyc ADDED
Binary file (2.78 kB). View file
 
app.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import argparse
5
+ import glob
6
+ import multiprocessing as mp
7
+ import os
8
+ import time
9
+ import cv2
10
+ import tqdm
11
+ import numpy as np
12
+ import gradio as gr
13
+
14
+ from detectron2.config import get_cfg
15
+
16
+ from detectron2.projects.deeplab import add_deeplab_config
17
+ from detectron2.data.detection_utils import read_image
18
+ from detectron2.utils.logger import setup_logger
19
+ from open_vocab_seg import add_ovseg_config
20
+
21
+ from open_vocab_seg.utils import VisualizationDemo
22
+
23
+ # constants
24
+ WINDOW_NAME = "Open vocabulary segmentation"
25
+
26
+
27
def setup_cfg(args):
    """Build a frozen detectron2 config from ``args.config_file`` and ``args.opts``.

    The extra OVSeg/DeepLab config keys must be registered *before* merging,
    otherwise ``merge_from_file`` would reject the unknown entries.
    """
    # load config from file and command-line arguments
    cfg = get_cfg()
    # for poly lr schedule
    add_deeplab_config(cfg)
    add_ovseg_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    # Freeze so downstream code cannot mutate the config accidentally.
    cfg.freeze()
    return cfg
37
+
38
+
39
def get_parser():
    """Construct the command-line parser for the OVSeg demo app.

    Defaults point at the demo config and a pretrained checkpoint; the gradio
    callback overwrites ``input`` and ``class_names`` per request.
    """
    default_classes = [
        "person", "car", "motorcycle", "truck", "bird", "dog", "handbag",
        "suitcase", "bottle", "cup", "bowl", "chair", "potted plant", "bed",
        "dining table", "tv", "laptop", "cell phone", "bag", "bin", "box",
        "door", "road barrier", "stick", "lamp", "floor", "wall",
    ]
    parser = argparse.ArgumentParser(
        description="Detectron2 demo for open vocabulary segmentation"
    )
    parser.add_argument(
        "--config-file",
        metavar="FILE",
        default="configs/ovseg_swinB_vitL_demo.yaml",
        help="path to config file",
    )
    parser.add_argument(
        "--input",
        nargs="+",
        default=["/mnt/lustre/jkyang/PSG4D/sailvos3d/downloads/sailvos3d/trevor_1_int/images/000160.bmp"],
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--class-names",
        nargs="+",
        default=default_classes,
        help="A list of user-defined class_names",
    )
    parser.add_argument(
        "--output",
        default="./pred",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    # REMAINDER swallows everything after --opts as raw KEY VALUE pairs.
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        default=["MODEL.WEIGHTS", "ovseg_swinbase_vitL14_ft_mpt.pth"],
        help="Modify config options using the command-line 'KEY VALUE' pairs",
    )
    return parser
73
+
74
# Parsed once at import time; the gradio callback below reuses and mutates
# these fields per request (NOTE(review): shared mutable state — not
# safe for concurrent requests; confirm the Space runs single-worker).
args = get_parser().parse_args()
75
+
76
def greet(rgb_input, depth_map_input, rage_matrices_input, class_candidates):
    """Gradio callback: run open-vocabulary RGBD segmentation on one example.

    Args:
        rgb_input: filepath of the RGB image (gr.Image with type='filepath').
        depth_map_input: uploaded depth-map file (gr.File; ``.name`` is its path).
        rage_matrices_input: uploaded 2D->3D projection-parameter file (gr.File).
        class_candidates: class names as one string, split on ', '.

    Returns:
        Tuple of (RGB SAM mask image, RGB 3D gif path, rendered depth image,
        depth SAM mask image, depth 3D gif path), read back from ``outputs/``.
    """
    # NOTE(review): debug prints left in; this also mutates the module-level
    # ``args`` parsed at import time, so concurrent requests would race.
    print(args.class_names)
    print(class_candidates[0], class_candidates[1], class_candidates[2], class_candidates[3],)
    print(class_candidates.split(', '))
    args.input = [rgb_input]
    args.class_names = class_candidates.split(', ')
    depth_map_path = depth_map_input.name
    rage_matrices_path = rage_matrices_input.name
    print(args.input, args.class_names, depth_map_path, rage_matrices_path)
    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    # NOTE(review): the whole model is rebuilt on every request — presumably
    # acceptable for a demo, but caching it would speed up repeat calls.
    demo = VisualizationDemo(cfg)
    class_names = args.class_names
    print(args.input)
    if args.input:
        if len(args.input) == 1:
            # Expand a single glob pattern (or plain path) into file paths.
            args.input = glob.glob(os.path.expanduser(args.input[0]))
            assert args.input, "The input path(s) was not found"
        for path in tqdm.tqdm(args.input, disable=not args.output):
            # use PIL, to be consistent with evaluation
            start_time = time.time()
            predictions, visualized_output_rgb, visualized_output_depth, visualized_output_rgb_sam, visualized_output_depth_sam = demo.run_on_image_sam(path, class_names, depth_map_path, rage_matrices_path)
            logger.info(
                "{}: {} in {:.2f}s".format(
                    path,
                    "detected {} instances".format(len(predictions["instances"]))
                    if "instances" in predictions
                    else "finished",
                    time.time() - start_time,
                )
            )

            if args.output:
                if os.path.isdir(args.output):
                    assert os.path.isdir(args.output), args.output
                    out_filename = os.path.join(args.output, os.path.basename(path))
                else:
                    assert len(args.input) == 1, "Please specify a directory with args.output"
                    out_filename = args.output
                # NOTE(review): ``out_filename`` is computed but never used —
                # all results go to hard-coded paths under outputs/.
                visualized_output_rgb.save('outputs/RGB_Semantic_SAM.png')
                visualized_output_depth.save('outputs/Depth_Semantic_SAM.png')
                visualized_output_rgb_sam.save('outputs/RGB_Semantic_SAM_Mask.png')
                visualized_output_depth_sam.save('outputs/Depth_Semantic_SAM_Mask.png')
                # Back-project each saved visualization into an XYZRGB point
                # cloud, save them together, then render the rotating 3D gifs.
                rgb_3d_sam = demo.get_xyzrgb('outputs/RGB_Semantic_SAM.png', depth_map_path, rage_matrices_path)
                depth_3d_sam = demo.get_xyzrgb('outputs/Depth_Semantic_SAM.png', depth_map_path, rage_matrices_path)
                rgb_3d_sam_mask = demo.get_xyzrgb('outputs/RGB_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
                depth_3d_sam_mask = demo.get_xyzrgb('outputs/Depth_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
                np.savez('outputs/xyzrgb.npz', rgb_3d_sam = rgb_3d_sam, depth_3d_sam = depth_3d_sam, rgb_3d_sam_mask = rgb_3d_sam_mask, depth_3d_sam_mask = depth_3d_sam_mask)
                demo.render_3d_video('outputs/xyzrgb.npz', depth_map_path)
            else:
                # Interactive OpenCV fallback — unreachable via gradio since
                # args.output defaults to "./pred".
                cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
                cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1])
                if cv2.waitKey(0) == 27:
                    break  # esc to quit
    else:
        raise NotImplementedError

    # Read the files written above back for the gradio output components.
    Depth_Semantic_SAM_Mask = read_image('outputs/Depth_Semantic_SAM_Mask.png')
    RGB_Semantic_SAM_Mask = read_image('outputs/RGB_Semantic_SAM_Mask.png')
    Depth_map = read_image('outputs/Depth_rendered.png')
    Depth_Semantic_SAM_Mask_gif = 'outputs/depth_3d_sam_mask.gif'
    RGB_Semantic_SAM_Mask_gif = 'outputs/rgb_3d_sam_mask.gif'
    return RGB_Semantic_SAM_Mask, RGB_Semantic_SAM_Mask_gif, Depth_map, Depth_Semantic_SAM_Mask, Depth_Semantic_SAM_Mask_gif
144
+
145
# Build the gradio UI. Component handles created here are wired to ``greet``
# twice: once via clickable examples and once via the Send button.
with gr.Blocks(analytics_enabled=False) as segrgbd_iface:
    gr.Markdown("<div align='center'> <h2> Semantic Segment AnyRGBD </span> </h2> \
    <a style='font-size:18px;color: #000000' href='https://github.com/Jun-CEN/SegmentAnyRGBD'> Github </div>")

    gr.Markdown("<b> You may duplicate the space and upgrade to GPU in settings for better performance and faster inference without waiting in the queue. <a style='display:inline-block' href='https://huggingface.co/spaces/VideoCrafter/VideoCrafter?duplicate=true'> <img src='https://bit.ly/3gLdBN6' alt='Duplicate Space'></a> </b>")
    #######t2v#######
    with gr.Tab(label="Dataset: Sailvos3D"):
        with gr.Column():
            with gr.Row():
                # with gr.Tab(label='input'):
                # Left column: inputs — RGB image, depth .npy, projection .npz,
                # and the comma-separated candidate class names.
                with gr.Column():
                    with gr.Row():
                        Input_RGB_Component = gr.Image(label = 'RGB_Input', type = 'filepath').style(width=320, height=200)
                        Depth_Map_Output_Component = gr.Image(label = "Depth_Map").style(width=320, height=200)
                    with gr.Row():
                        Depth_Map_Input_Component = gr.File(label = 'Depth_map')
                        Component_2D_to_3D_Projection_Parameters = gr.File(label = '2D_to_3D_Projection_Parameters')
                    with gr.Row():
                        Class_Candidates_Component = gr.Text(label = 'Class_Candidates')
                    vc_end_btn = gr.Button("Send")
                # Result tab: 2D semantic SAM masks plus their 3D renderings.
                with gr.Tab(label='Result'):
                    with gr.Row():
                        RGB_Semantic_SAM_Mask_Component = gr.Image(label = "RGB_Semantic_SAM_Mask").style(width=320, height=200)
                        RGB_Semantic_SAM_Mask_3D_Component = gr.Image(label = "3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
                    with gr.Row():
                        Depth_Semantic_SAM_Mask_Component = gr.Image(label = "Depth_Semantic_SAM_Mask").style(width=320, height=200)
                        Depth_Semantic_SAM_Mask_3D_Component = gr.Image(label = "3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
            # Clickable example rows; paths are relative to the repo root.
            gr.Examples(examples=[
                [
                    'UI/sailvos3d/ex1/inputs/rgb_000160.bmp',
                    'UI/sailvos3d/ex1/inputs/depth_000160.npy',
                    'UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz',
                    'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, cup, bowl, chair, potted plant, bed, dining table, tv, laptop, cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall',
                ],
                [
                    'UI/sailvos3d/ex2/inputs/rgb_000540.bmp',
                    'UI/sailvos3d/ex2/inputs/depth_000540.npy',
                    'UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz',
                    'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, cup, bowl, chair, potted plant, bed, dining table, tv, laptop, cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall',
                ]],
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet)
            vc_end_btn.click(inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet)

demo = segrgbd_iface
demo.launch()
configs/ovseg_swinB_vitL_bs32_120k.yaml ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "OVSeg"
3
+ BACKBONE:
4
+ FREEZE_AT: 0
5
+ NAME: "D2SwinTransformer"
6
+ SWIN:
7
+ EMBED_DIM: 128
8
+ DEPTHS: [2, 2, 18, 2]
9
+ NUM_HEADS: [4, 8, 16, 32]
10
+ WINDOW_SIZE: 12
11
+ APE: False
12
+ DROP_PATH_RATE: 0.3
13
+ PATCH_NORM: True
14
+ PRETRAIN_IMG_SIZE: 384
15
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
17
+ PIXEL_STD: [58.395, 57.120, 57.375]
18
+ SEM_SEG_HEAD:
19
+ NAME: "OpenVocabMaskFormerHead"
20
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
21
+ IGNORE_VALUE: 255
22
+ NUM_CLASSES: 171 # number of categories in training set
23
+ EMBEDDING_DIM: 768
24
+ EMBED_LAYERS: 2
25
+ COMMON_STRIDE: 4 # not used, hard-coded
26
+ LOSS_WEIGHT: 1.0
27
+ CONVS_DIM: 256
28
+ MASK_DIM: 256
29
+ NORM: "GN"
30
+ MASK_FORMER:
31
+ TRANSFORMER_IN_FEATURE: "res5"
32
+ DEEP_SUPERVISION: True
33
+ NO_OBJECT_WEIGHT: 0.1
34
+ DICE_WEIGHT: 1.0
35
+ MASK_WEIGHT: 20.0
36
+ HIDDEN_DIM: 256
37
+ NUM_OBJECT_QUERIES: 100
38
+ NHEADS: 8
39
+ DROPOUT: 0.1
40
+ DIM_FEEDFORWARD: 2048
41
+ ENC_LAYERS: 0
42
+ DEC_LAYERS: 6
43
+ PRE_NORM: False
44
+ CLIP_ADAPTER:
45
+ TEXT_TEMPLATES: "vild"
46
+ CLIP_MODEL_NAME: "ViT-L/14"
47
+ MASK_FILL: "mean"
48
+ MASK_EXPAND_RATIO: 1.0
49
+ MASK_THR: 0.4 # choose the foreground objects
50
+ MASK_MATTING: False # use soft background, default not used
51
+ MASK_PROMPT_DEPTH: 3
52
+ MASK_PROMPT_FWD: True # use mask prompt during forward
53
+ REGION_RESIZED: True # resize to the input of clip, e.g., 224
54
+ CLIP_ENSEMBLE: True # use ensemble of two classification branches
55
+ CLIP_ENSEMBLE_WEIGHT: 0.7
56
+ DATASETS:
57
+ TRAIN: ("coco_2017_train_stuff_sem_seg",)
58
+ TEST: ("ade20k_sem_seg_val",)
59
+ SOLVER:
60
+ IMS_PER_BATCH: 32
61
+ BASE_LR: 0.00006
62
+ MAX_ITER: 120000
63
+ WARMUP_FACTOR: 1e-6
64
+ WARMUP_ITERS: 1500
65
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
66
+ WEIGHT_DECAY: 0.01
67
+ WEIGHT_DECAY_NORM: 0.0
68
+ WEIGHT_DECAY_EMBED: 0.0
69
+ BACKBONE_MULTIPLIER: 1.0
70
+ TEST_IMS_PER_BATCH: 1
71
+ CLIP_GRADIENTS:
72
+ ENABLED: True
73
+ CLIP_TYPE: "full_model"
74
+ CLIP_VALUE: 0.01
75
+ NORM_TYPE: 2.0
76
+ INPUT:
77
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
78
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
79
+ MIN_SIZE_TEST: 640
80
+ MAX_SIZE_TRAIN: 2560
81
+ MAX_SIZE_TEST: 2560
82
+ CROP:
83
+ ENABLED: True
84
+ TYPE: "absolute"
85
+ SIZE: (640, 640)
86
+ SINGLE_CATEGORY_MAX_AREA: 1.0
87
+ COLOR_AUG_SSD: True
88
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
89
+ FORMAT: "RGB"
90
+ TEST:
91
+ EVAL_PERIOD: 5000
92
+ AUG:
93
+ ENABLED: False
94
+ MIN_SIZES: [256, 384, 512, 640, 768, 896]
95
+ MAX_SIZE: 3584
96
+ FLIP: True
97
+ DATALOADER:
98
+ FILTER_EMPTY_ANNOTATIONS: True
99
+ NUM_WORKERS: 4
100
+ VERSION: 2
configs/ovseg_swinB_vitL_demo.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "OVSegDEMO"
3
+ BACKBONE:
4
+ FREEZE_AT: 0
5
+ NAME: "D2SwinTransformer"
6
+ SWIN:
7
+ EMBED_DIM: 128
8
+ DEPTHS: [2, 2, 18, 2]
9
+ NUM_HEADS: [4, 8, 16, 32]
10
+ WINDOW_SIZE: 12
11
+ APE: False
12
+ DROP_PATH_RATE: 0.3
13
+ PATCH_NORM: True
14
+ PRETRAIN_IMG_SIZE: 384
15
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
16
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
17
+ PIXEL_STD: [58.395, 57.120, 57.375]
18
+ SEM_SEG_HEAD:
19
+ NAME: "OpenVocabMaskFormerHead"
20
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
21
+ IGNORE_VALUE: 255
22
+ NUM_CLASSES: 171 # number of categories in training set
23
+ EMBEDDING_DIM: 768
24
+ EMBED_LAYERS: 2
25
+ COMMON_STRIDE: 4 # not used, hard-coded
26
+ LOSS_WEIGHT: 1.0
27
+ CONVS_DIM: 256
28
+ MASK_DIM: 256
29
+ NORM: "GN"
30
+ MASK_FORMER:
31
+ TRANSFORMER_IN_FEATURE: "res5"
32
+ DEEP_SUPERVISION: True
33
+ NO_OBJECT_WEIGHT: 0.1
34
+ DICE_WEIGHT: 1.0
35
+ MASK_WEIGHT: 20.0
36
+ HIDDEN_DIM: 256
37
+ NUM_OBJECT_QUERIES: 100
38
+ NHEADS: 8
39
+ DROPOUT: 0.1
40
+ DIM_FEEDFORWARD: 2048
41
+ ENC_LAYERS: 0
42
+ DEC_LAYERS: 6
43
+ PRE_NORM: False
44
+ CLIP_ADAPTER:
45
+ TEXT_TEMPLATES: "vild"
46
+ CLIP_MODEL_NAME: "ViT-L/14"
47
+ MASK_FILL: "mean"
48
+ MASK_EXPAND_RATIO: 1.0
49
+ MASK_THR: 0.1 # choose the foreground objects
50
+ MASK_MATTING: False # use soft background, default not used
51
+ MASK_PROMPT_DEPTH: 3
52
+ MASK_PROMPT_FWD: True # use mask prompt during forward
53
+ REGION_RESIZED: True # resize to the input of clip, e.g., 224
54
+ CLIP_ENSEMBLE: True # use ensemble of two classification branches
55
+ CLIP_ENSEMBLE_WEIGHT: 0.0
56
+ DATASETS:
57
+ TRAIN: ("coco_2017_train_stuff_sem_seg",)
58
+ TEST: ("ade20k_sem_seg_val",)
59
+ SOLVER:
60
+ IMS_PER_BATCH: 32
61
+ BASE_LR: 0.00006
62
+ MAX_ITER: 120000
63
+ WARMUP_FACTOR: 1e-6
64
+ WARMUP_ITERS: 1500
65
+ WEIGHT_DECAY: 0.01
66
+ WEIGHT_DECAY_NORM: 0.0
67
+ WEIGHT_DECAY_EMBED: 0.0
68
+ BACKBONE_MULTIPLIER: 1.0
69
+ TEST_IMS_PER_BATCH: 1
70
+ CLIP_GRADIENTS:
71
+ ENABLED: True
72
+ CLIP_TYPE: "full_model"
73
+ CLIP_VALUE: 0.01
74
+ NORM_TYPE: 2.0
75
+ INPUT:
76
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
77
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
78
+ MIN_SIZE_TEST: 640
79
+ MAX_SIZE_TRAIN: 2560
80
+ MAX_SIZE_TEST: 2560
81
+ CROP:
82
+ ENABLED: True
83
+ TYPE: "absolute"
84
+ SIZE: (640, 640)
85
+ SINGLE_CATEGORY_MAX_AREA: 1.0
86
+ COLOR_AUG_SSD: True
87
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
88
+ FORMAT: "RGB"
89
+ TEST:
90
+ EVAL_PERIOD: 5000
91
+ AUG:
92
+ ENABLED: False
93
+ MIN_SIZES: [256, 384, 512, 640, 768, 896]
94
+ MAX_SIZE: 3584
95
+ FLIP: True
96
+ DATALOADER:
97
+ FILTER_EMPTY_ANNOTATIONS: True
98
+ NUM_WORKERS: 4
99
+ VERSION: 2
datasets/DATASETS.md ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Prepare Datasets for OVSeg
2
+
3
+ This doc is a modification/extension of [MaskFormer](https://github.com/facebookresearch/MaskFormer/blob/main/datasets/README.md) following the [Detectron2 format](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html).
4
+
5
+ A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog)
6
+ for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc).
7
+ This document explains how to setup the builtin datasets so they can be used by the above APIs.
8
+ [Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`,
9
+ and how to add new datasets to them.
10
+
11
+ OVSeg has builtin support for a few datasets.
12
+ The datasets are assumed to exist in a directory specified by the environment variable
13
+ `DETECTRON2_DATASETS`.
14
+ Under this directory, detectron2 will look for datasets in the structure described below, if needed.
15
+ ```
16
+ $DETECTRON2_DATASETS/
17
+ coco/ # COCOStuff-171
18
+ ADEChallengeData2016/ # ADE20K-150
19
+ ADE20K_2021_17_01/ # ADE20K-847
20
+ VOCdevkit/
21
+ VOC2012/ # PASCALVOC-20
22
+ VOC2010/ # PASCALContext-59, PASCALContext-459
23
+ ```
24
+
25
+ You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`.
26
+ If left unset, the default is `./datasets` relative to your current working directory.
27
+
28
+ Unless otherwise specified, our model is trained on COCOStuff-171 and evaluated on ADE20K-150, ADE20K-847, PASCALVOC-20, PASCALContext-59 and PASCALContext-459.
29
+
30
+ | dataset | split | # images | # categories |
31
+ |:--------------:|:---------:|:--------:|:------------:|
32
+ | COCO Stuff | train2017 | 118K | 171 |
33
+ | ADE20K | val | 2K | 150/847 |
34
+ | Pascal VOC | val | 1.5K | 20 |
35
+ | Pascal Context | val | 5K | 59/459 |
36
+
37
+
38
+ ### Expected dataset structure for [COCO Stuff](https://github.com/nightrome/cocostuff):
39
+ ```
40
+ coco/
41
+ train2017/ # http://images.cocodataset.org/zips/train2017.zip
42
+ annotations/ # http://images.cocodataset.org/annotations/annotations_trainval2017.zip
43
+ stuffthingmaps/
44
+ stuffthingmaps_trainval2017.zip # http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
45
+ train2017/
46
+ # below are generated
47
+ stuffthingmaps_detectron2/
48
+ train2017/
49
+ ```
50
+
51
+ The directory `stuffthingmaps_detectron2` is generated by running `python datasets/prepare_coco_stuff_sem_seg.py`.
52
+
53
+
54
+
55
+ ### Expected dataset structure for [ADE20k Scene Parsing (ADE20K-150)](http://sceneparsing.csail.mit.edu/):
56
+ ```
57
+ ADEChallengeData2016/
58
+ annotations/
59
+ images/
60
+ objectInfo150.txt
61
+ # below are generated
62
+ annotations_detectron2/
63
+ ```
64
+ The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`.
65
+
66
+
67
+ ### Expected dataset structure for [ADE20k-Full (ADE20K-847)](https://github.com/CSAILVision/ADE20K#download):
68
+ ```
69
+ ADE20K_2021_17_01/
70
+ images/
71
+ index_ade20k.pkl
72
+ objects.txt
73
+ # below are generated
74
+ images_detectron2/
75
+ annotations_detectron2/
76
+ ```
77
+ The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_ade20k_full_sem_seg.py`.
78
+
79
+ ### Expected dataset structure for [Pascal VOC 2012 (PASCALVOC-20)](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/#devkit):
80
+ ```
81
+ VOCdevkit/VOC2012/
82
+ Annotations/
83
+ ImageSets/
84
+ JPEGImages/
85
+ SegmentationClass/
86
+ SegmentationObject/
87
+ SegmentationClassAug/ # https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md
88
+ # below are generated
89
+ images_detectron2/
90
+ annotations_detectron2/
91
+ ```
92
+
93
+ It starts with a tar file `VOCtrainval_11-May-2012.tar`.
94
+
95
+ We use SBD augmented training data as `SegmentationClassAug` following [Deeplab](https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md)
96
+
97
+ The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_voc_sem_seg.py`.
98
+
99
+
100
+ ### Expected dataset structure for [Pascal Context](https://www.cs.stanford.edu/~roozbeh/pascal-context/):
101
+
102
+ ```
103
+ VOCdevkit/VOC2010/
104
+ Annotations/
105
+ ImageSets/
106
+ JPEGImages/
107
+ SegmentationClass/
108
+ SegmentationObject/
109
+ # below are from https://www.cs.stanford.edu/~roozbeh/pascal-context/trainval.tar.gz
110
+ trainval/
111
+ labels.txt
112
+ 59_labels.txt # https://www.cs.stanford.edu/~roozbeh/pascal-context/59_labels.txt
113
+ pascalcontext_val.txt # https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing
114
+ # below are generated
115
+ annotations_detectron2/
116
+ pc459_val
117
+ pc59_val
118
+ ```
119
+ It starts with a tar file `VOCtrainval_03-May-2010.tar`. You may want to download the 5K validation set [here](https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing).
120
+
121
+ The directory `annotations_detectron2` is generated by running `python datasets/prepare_pascal_context.py`.
122
+
datasets/prepare_ade20k_full_sem_seg.py ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import os
5
+ import pickle as pkl
6
+ from pathlib import Path
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import tqdm
11
+ from PIL import Image
12
+
13
+ ADE20K_SEM_SEG_FULL_CATEGORIES = [
14
+ {"name": "wall", "id": 2978, "trainId": 0},
15
+ {"name": "building, edifice", "id": 312, "trainId": 1},
16
+ {"name": "sky", "id": 2420, "trainId": 2},
17
+ {"name": "tree", "id": 2855, "trainId": 3},
18
+ {"name": "road, route", "id": 2131, "trainId": 4},
19
+ {"name": "floor, flooring", "id": 976, "trainId": 5},
20
+ {"name": "ceiling", "id": 447, "trainId": 6},
21
+ {"name": "bed", "id": 165, "trainId": 7},
22
+ {"name": "sidewalk, pavement", "id": 2377, "trainId": 8},
23
+ {"name": "earth, ground", "id": 838, "trainId": 9},
24
+ {"name": "cabinet", "id": 350, "trainId": 10},
25
+ {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11},
26
+ {"name": "grass", "id": 1125, "trainId": 12},
27
+ {"name": "windowpane, window", "id": 3055, "trainId": 13},
28
+ {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14},
29
+ {"name": "mountain, mount", "id": 1610, "trainId": 15},
30
+ {"name": "plant, flora, plant life", "id": 1910, "trainId": 16},
31
+ {"name": "table", "id": 2684, "trainId": 17},
32
+ {"name": "chair", "id": 471, "trainId": 18},
33
+ {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19},
34
+ {"name": "door", "id": 774, "trainId": 20},
35
+ {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21},
36
+ {"name": "sea", "id": 2264, "trainId": 22},
37
+ {"name": "painting, picture", "id": 1735, "trainId": 23},
38
+ {"name": "water", "id": 2994, "trainId": 24},
39
+ {"name": "mirror", "id": 1564, "trainId": 25},
40
+ {"name": "house", "id": 1276, "trainId": 26},
41
+ {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27},
42
+ {"name": "shelf", "id": 2329, "trainId": 28},
43
+ {"name": "armchair", "id": 57, "trainId": 29},
44
+ {"name": "fence, fencing", "id": 907, "trainId": 30},
45
+ {"name": "field", "id": 913, "trainId": 31},
46
+ {"name": "lamp", "id": 1395, "trainId": 32},
47
+ {"name": "rock, stone", "id": 2138, "trainId": 33},
48
+ {"name": "seat", "id": 2272, "trainId": 34},
49
+ {"name": "river", "id": 2128, "trainId": 35},
50
+ {"name": "desk", "id": 724, "trainId": 36},
51
+ {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37},
52
+ {"name": "railing, rail", "id": 2053, "trainId": 38},
53
+ {"name": "signboard, sign", "id": 2380, "trainId": 39},
54
+ {"name": "cushion", "id": 689, "trainId": 40},
55
+ {"name": "path", "id": 1788, "trainId": 41},
56
+ {"name": "work surface", "id": 3087, "trainId": 42},
57
+ {"name": "stairs, steps", "id": 2530, "trainId": 43},
58
+ {"name": "column, pillar", "id": 581, "trainId": 44},
59
+ {"name": "sink", "id": 2388, "trainId": 45},
60
+ {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46},
61
+ {"name": "snow", "id": 2454, "trainId": 47},
62
+ {"name": "refrigerator, icebox", "id": 2096, "trainId": 48},
63
+ {"name": "base, pedestal, stand", "id": 137, "trainId": 49},
64
+ {"name": "bridge, span", "id": 294, "trainId": 50},
65
+ {"name": "blind, screen", "id": 212, "trainId": 51},
66
+ {"name": "runway", "id": 2185, "trainId": 52},
67
+ {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53},
68
+ {"name": "sand", "id": 2212, "trainId": 54},
69
+ {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55},
70
+ {"name": "pillow", "id": 1869, "trainId": 56},
71
+ {"name": "screen door, screen", "id": 2251, "trainId": 57},
72
+ {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58},
73
+ {"name": "skyscraper", "id": 2423, "trainId": 59},
74
+ {"name": "grandstand, covered stand", "id": 1121, "trainId": 60},
75
+ {"name": "box", "id": 266, "trainId": 61},
76
+ {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62},
77
+ {"name": "palm, palm tree", "id": 1744, "trainId": 63},
78
+ {"name": "double door", "id": 783, "trainId": 64},
79
+ {"name": "coffee table, cocktail table", "id": 571, "trainId": 65},
80
+ {"name": "counter", "id": 627, "trainId": 66},
81
+ {"name": "countertop", "id": 629, "trainId": 67},
82
+ {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68},
83
+ {"name": "kitchen island", "id": 1374, "trainId": 69},
84
+ {"name": "boat", "id": 223, "trainId": 70},
85
+ {"name": "waterfall, falls", "id": 3016, "trainId": 71},
86
+ {
87
+ "name": "stove, kitchen stove, range, kitchen range, cooking stove",
88
+ "id": 2598,
89
+ "trainId": 72,
90
+ },
91
+ {"name": "flower", "id": 978, "trainId": 73},
92
+ {"name": "bookcase", "id": 239, "trainId": 74},
93
+ {"name": "controls", "id": 608, "trainId": 75},
94
+ {"name": "book", "id": 236, "trainId": 76},
95
+ {"name": "stairway, staircase", "id": 2531, "trainId": 77},
96
+ {"name": "streetlight, street lamp", "id": 2616, "trainId": 78},
97
+ {
98
+ "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system",
99
+ "id": 591,
100
+ "trainId": 79,
101
+ },
102
+ {
103
+ "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle",
104
+ "id": 327,
105
+ "trainId": 80,
106
+ },
107
+ {"name": "swivel chair", "id": 2679, "trainId": 81},
108
+ {"name": "light, light source", "id": 1451, "trainId": 82},
109
+ {"name": "bench", "id": 181, "trainId": 83},
110
+ {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84},
111
+ {"name": "towel", "id": 2821, "trainId": 85},
112
+ {"name": "fountain", "id": 1023, "trainId": 86},
113
+ {"name": "embankment", "id": 855, "trainId": 87},
114
+ {
115
+ "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box",
116
+ "id": 2733,
117
+ "trainId": 88,
118
+ },
119
+ {"name": "van", "id": 2928, "trainId": 89},
120
+ {"name": "hill", "id": 1240, "trainId": 90},
121
+ {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91},
122
+ {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92},
123
+ {"name": "truck, motortruck", "id": 2880, "trainId": 93},
124
+ {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94},
125
+ {"name": "pole", "id": 1936, "trainId": 95},
126
+ {"name": "tower", "id": 2828, "trainId": 96},
127
+ {"name": "court", "id": 631, "trainId": 97},
128
+ {"name": "ball", "id": 103, "trainId": 98},
129
+ {
130
+ "name": "aircraft carrier, carrier, flattop, attack aircraft carrier",
131
+ "id": 3144,
132
+ "trainId": 99,
133
+ },
134
+ {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100},
135
+ {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101},
136
+ {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102},
137
+ {"name": "minibike, motorbike", "id": 1563, "trainId": 103},
138
+ {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104},
139
+ {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105},
140
+ {"name": "step, stair", "id": 2569, "trainId": 106},
141
+ {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107},
142
+ {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108},
143
+ {"name": "doorframe, doorcase", "id": 778, "trainId": 109},
144
+ {"name": "sconce", "id": 2243, "trainId": 110},
145
+ {"name": "pond", "id": 1941, "trainId": 111},
146
+ {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112},
147
+ {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113},
148
+ {"name": "bag", "id": 95, "trainId": 114},
149
+ {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115},
150
+ {"name": "gazebo", "id": 1087, "trainId": 116},
151
+ {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117},
152
+ {"name": "land, ground, soil", "id": 1401, "trainId": 118},
153
+ {"name": "board, plank", "id": 220, "trainId": 119},
154
+ {"name": "arcade machine", "id": 47, "trainId": 120},
155
+ {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121},
156
+ {"name": "bar", "id": 123, "trainId": 122},
157
+ {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123},
158
+ {"name": "playground", "id": 1927, "trainId": 124},
159
+ {"name": "ship", "id": 2337, "trainId": 125},
160
+ {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126},
161
+ {
162
+ "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin",
163
+ "id": 64,
164
+ "trainId": 127,
165
+ },
166
+ {"name": "bottle", "id": 249, "trainId": 128},
167
+ {"name": "cradle", "id": 642, "trainId": 129},
168
+ {"name": "pot, flowerpot", "id": 1981, "trainId": 130},
169
+ {
170
+ "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
171
+ "id": 609,
172
+ "trainId": 131,
173
+ },
174
+ {"name": "train, railroad train", "id": 2840, "trainId": 132},
175
+ {"name": "stool", "id": 2586, "trainId": 133},
176
+ {"name": "lake", "id": 1393, "trainId": 134},
177
+ {"name": "tank, storage tank", "id": 2704, "trainId": 135},
178
+ {"name": "ice, water ice", "id": 1304, "trainId": 136},
179
+ {"name": "basket, handbasket", "id": 146, "trainId": 137},
180
+ {"name": "manhole", "id": 1494, "trainId": 138},
181
+ {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139},
182
+ {"name": "canopy", "id": 389, "trainId": 140},
183
+ {"name": "microwave, microwave oven", "id": 1551, "trainId": 141},
184
+ {"name": "barrel, cask", "id": 131, "trainId": 142},
185
+ {"name": "dirt track", "id": 738, "trainId": 143},
186
+ {"name": "beam", "id": 161, "trainId": 144},
187
+ {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145},
188
+ {"name": "plate", "id": 1919, "trainId": 146},
189
+ {"name": "screen, crt screen", "id": 3109, "trainId": 147},
190
+ {"name": "ruins", "id": 2179, "trainId": 148},
191
+ {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149},
192
+ {"name": "blanket, cover", "id": 206, "trainId": 150},
193
+ {"name": "plaything, toy", "id": 1930, "trainId": 151},
194
+ {"name": "food, solid food", "id": 1002, "trainId": 152},
195
+ {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153},
196
+ {"name": "oven", "id": 1708, "trainId": 154},
197
+ {"name": "stage", "id": 2526, "trainId": 155},
198
+ {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156},
199
+ {"name": "umbrella", "id": 2901, "trainId": 157},
200
+ {"name": "sculpture", "id": 2262, "trainId": 158},
201
+ {"name": "aqueduct", "id": 44, "trainId": 159},
202
+ {"name": "container", "id": 597, "trainId": 160},
203
+ {"name": "scaffolding, staging", "id": 2235, "trainId": 161},
204
+ {"name": "hood, exhaust hood", "id": 1260, "trainId": 162},
205
+ {"name": "curb, curbing, kerb", "id": 682, "trainId": 163},
206
+ {"name": "roller coaster", "id": 2151, "trainId": 164},
207
+ {"name": "horse, equus caballus", "id": 3107, "trainId": 165},
208
+ {"name": "catwalk", "id": 432, "trainId": 166},
209
+ {"name": "glass, drinking glass", "id": 1098, "trainId": 167},
210
+ {"name": "vase", "id": 2932, "trainId": 168},
211
+ {"name": "central reservation", "id": 461, "trainId": 169},
212
+ {"name": "carousel", "id": 410, "trainId": 170},
213
+ {"name": "radiator", "id": 2046, "trainId": 171},
214
+ {"name": "closet", "id": 533, "trainId": 172},
215
+ {"name": "machine", "id": 1481, "trainId": 173},
216
+ {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174},
217
+ {"name": "fan", "id": 894, "trainId": 175},
218
+ {"name": "inflatable bounce game", "id": 1322, "trainId": 176},
219
+ {"name": "pitch", "id": 1891, "trainId": 177},
220
+ {"name": "paper", "id": 1756, "trainId": 178},
221
+ {"name": "arcade, colonnade", "id": 49, "trainId": 179},
222
+ {"name": "hot tub", "id": 1272, "trainId": 180},
223
+ {"name": "helicopter", "id": 1229, "trainId": 181},
224
+ {"name": "tray", "id": 2850, "trainId": 182},
225
+ {"name": "partition, divider", "id": 1784, "trainId": 183},
226
+ {"name": "vineyard", "id": 2962, "trainId": 184},
227
+ {"name": "bowl", "id": 259, "trainId": 185},
228
+ {"name": "bullring", "id": 319, "trainId": 186},
229
+ {"name": "flag", "id": 954, "trainId": 187},
230
+ {"name": "pot", "id": 1974, "trainId": 188},
231
+ {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189},
232
+ {"name": "shower", "id": 2356, "trainId": 190},
233
+ {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191},
234
+ {"name": "bulletin board, notice board", "id": 318, "trainId": 192},
235
+ {"name": "confessional booth", "id": 592, "trainId": 193},
236
+ {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194},
237
+ {"name": "forest", "id": 1017, "trainId": 195},
238
+ {"name": "elevator door", "id": 851, "trainId": 196},
239
+ {"name": "laptop, laptop computer", "id": 1407, "trainId": 197},
240
+ {"name": "instrument panel", "id": 1332, "trainId": 198},
241
+ {"name": "bucket, pail", "id": 303, "trainId": 199},
242
+ {"name": "tapestry, tapis", "id": 2714, "trainId": 200},
243
+ {"name": "platform", "id": 1924, "trainId": 201},
244
+ {"name": "jacket", "id": 1346, "trainId": 202},
245
+ {"name": "gate", "id": 1081, "trainId": 203},
246
+ {"name": "monitor, monitoring device", "id": 1583, "trainId": 204},
247
+ {
248
+ "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk",
249
+ "id": 2727,
250
+ "trainId": 205,
251
+ },
252
+ {"name": "spotlight, spot", "id": 2509, "trainId": 206},
253
+ {"name": "ring", "id": 2123, "trainId": 207},
254
+ {"name": "control panel", "id": 602, "trainId": 208},
255
+ {"name": "blackboard, chalkboard", "id": 202, "trainId": 209},
256
+ {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210},
257
+ {"name": "chest", "id": 490, "trainId": 211},
258
+ {"name": "clock", "id": 530, "trainId": 212},
259
+ {"name": "sand dune", "id": 2213, "trainId": 213},
260
+ {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214},
261
+ {"name": "vault", "id": 2934, "trainId": 215},
262
+ {"name": "table football", "id": 2687, "trainId": 216},
263
+ {"name": "cannon", "id": 387, "trainId": 217},
264
+ {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218},
265
+ {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219},
266
+ {"name": "statue", "id": 2547, "trainId": 220},
267
+ {
268
+ "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system",
269
+ "id": 1474,
270
+ "trainId": 221,
271
+ },
272
+ {"name": "exhibitor", "id": 877, "trainId": 222},
273
+ {"name": "ladder", "id": 1391, "trainId": 223},
274
+ {"name": "carport", "id": 414, "trainId": 224},
275
+ {"name": "dam", "id": 698, "trainId": 225},
276
+ {"name": "pulpit", "id": 2019, "trainId": 226},
277
+ {"name": "skylight, fanlight", "id": 2422, "trainId": 227},
278
+ {"name": "water tower", "id": 3010, "trainId": 228},
279
+ {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229},
280
+ {"name": "display board", "id": 753, "trainId": 230},
281
+ {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231},
282
+ {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232},
283
+ {"name": "ice rink", "id": 1301, "trainId": 233},
284
+ {"name": "fruit", "id": 1033, "trainId": 234},
285
+ {"name": "patio", "id": 1789, "trainId": 235},
286
+ {"name": "vending machine", "id": 2939, "trainId": 236},
287
+ {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237},
288
+ {"name": "net", "id": 1652, "trainId": 238},
289
+ {
290
+ "name": "backpack, back pack, knapsack, packsack, rucksack, haversack",
291
+ "id": 90,
292
+ "trainId": 239,
293
+ },
294
+ {"name": "jar", "id": 1349, "trainId": 240},
295
+ {"name": "track", "id": 2830, "trainId": 241},
296
+ {"name": "magazine", "id": 1485, "trainId": 242},
297
+ {"name": "shutter", "id": 2370, "trainId": 243},
298
+ {"name": "roof", "id": 2155, "trainId": 244},
299
+ {"name": "banner, streamer", "id": 118, "trainId": 245},
300
+ {"name": "landfill", "id": 1402, "trainId": 246},
301
+ {"name": "post", "id": 1957, "trainId": 247},
302
+ {"name": "altarpiece, reredos", "id": 3130, "trainId": 248},
303
+ {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249},
304
+ {"name": "arch, archway", "id": 52, "trainId": 250},
305
+ {"name": "table game", "id": 2688, "trainId": 251},
306
+ {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252},
307
+ {"name": "document, written document, papers", "id": 762, "trainId": 253},
308
+ {"name": "dome", "id": 772, "trainId": 254},
309
+ {"name": "pier", "id": 1857, "trainId": 255},
310
+ {"name": "shanties", "id": 2315, "trainId": 256},
311
+ {"name": "forecourt", "id": 1016, "trainId": 257},
312
+ {"name": "crane", "id": 643, "trainId": 258},
313
+ {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259},
314
+ {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260},
315
+ {"name": "drawing", "id": 791, "trainId": 261},
316
+ {"name": "cabin", "id": 349, "trainId": 262},
317
+ {
318
+ "name": "ad, advertisement, advertizement, advertising, advertizing, advert",
319
+ "id": 6,
320
+ "trainId": 263,
321
+ },
322
+ {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264},
323
+ {"name": "monument", "id": 1587, "trainId": 265},
324
+ {"name": "henhouse", "id": 1233, "trainId": 266},
325
+ {"name": "cockpit", "id": 559, "trainId": 267},
326
+ {"name": "heater, warmer", "id": 1223, "trainId": 268},
327
+ {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269},
328
+ {"name": "pool", "id": 1943, "trainId": 270},
329
+ {"name": "elevator, lift", "id": 853, "trainId": 271},
330
+ {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272},
331
+ {"name": "labyrinth", "id": 1390, "trainId": 273},
332
+ {"name": "text, textual matter", "id": 2748, "trainId": 274},
333
+ {"name": "printer", "id": 2007, "trainId": 275},
334
+ {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276},
335
+ {"name": "mattress", "id": 1513, "trainId": 277},
336
+ {"name": "straw", "id": 2600, "trainId": 278},
337
+ {"name": "stalls", "id": 2538, "trainId": 279},
338
+ {"name": "patio, terrace", "id": 1790, "trainId": 280},
339
+ {"name": "billboard, hoarding", "id": 194, "trainId": 281},
340
+ {"name": "bus stop", "id": 326, "trainId": 282},
341
+ {"name": "trouser, pant", "id": 2877, "trainId": 283},
342
+ {"name": "console table, console", "id": 594, "trainId": 284},
343
+ {"name": "rack", "id": 2036, "trainId": 285},
344
+ {"name": "notebook", "id": 1662, "trainId": 286},
345
+ {"name": "shrine", "id": 2366, "trainId": 287},
346
+ {"name": "pantry", "id": 1754, "trainId": 288},
347
+ {"name": "cart", "id": 418, "trainId": 289},
348
+ {"name": "steam shovel", "id": 2553, "trainId": 290},
349
+ {"name": "porch", "id": 1951, "trainId": 291},
350
+ {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292},
351
+ {"name": "figurine, statuette", "id": 918, "trainId": 293},
352
+ {"name": "recycling bin", "id": 2086, "trainId": 294},
353
+ {"name": "folding screen", "id": 997, "trainId": 295},
354
+ {"name": "telescope", "id": 2731, "trainId": 296},
355
+ {"name": "deck chair, beach chair", "id": 704, "trainId": 297},
356
+ {"name": "kennel", "id": 1365, "trainId": 298},
357
+ {"name": "coffee maker", "id": 569, "trainId": 299},
358
+ {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300},
359
+ {"name": "fish", "id": 948, "trainId": 301},
360
+ {"name": "easel", "id": 839, "trainId": 302},
361
+ {"name": "artificial golf green", "id": 63, "trainId": 303},
362
+ {"name": "iceberg", "id": 1305, "trainId": 304},
363
+ {"name": "candlestick, candle holder", "id": 378, "trainId": 305},
364
+ {"name": "shower stall, shower bath", "id": 2362, "trainId": 306},
365
+ {"name": "television stand", "id": 2734, "trainId": 307},
366
+ {
367
+ "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle",
368
+ "id": 2982,
369
+ "trainId": 308,
370
+ },
371
+ {"name": "skeleton", "id": 2398, "trainId": 309},
372
+ {"name": "grand piano, grand", "id": 1119, "trainId": 310},
373
+ {"name": "candy, confect", "id": 382, "trainId": 311},
374
+ {"name": "grille door", "id": 1141, "trainId": 312},
375
+ {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313},
376
+ {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314},
377
+ {"name": "shoe", "id": 2341, "trainId": 315},
378
+ {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316},
379
+ {"name": "shanty", "id": 2316, "trainId": 317},
380
+ {"name": "structure", "id": 2626, "trainId": 318},
381
+ {"name": "rocking chair, rocker", "id": 3104, "trainId": 319},
382
+ {"name": "bird", "id": 198, "trainId": 320},
383
+ {"name": "place mat", "id": 1896, "trainId": 321},
384
+ {"name": "tomb", "id": 2800, "trainId": 322},
385
+ {"name": "big top", "id": 190, "trainId": 323},
386
+ {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324},
387
+ {"name": "lockers", "id": 1463, "trainId": 325},
388
+ {"name": "cage", "id": 357, "trainId": 326},
389
+ {"name": "finger", "id": 929, "trainId": 327},
390
+ {"name": "bleachers", "id": 209, "trainId": 328},
391
+ {"name": "ferris wheel", "id": 912, "trainId": 329},
392
+ {"name": "hairdresser chair", "id": 1164, "trainId": 330},
393
+ {"name": "mat", "id": 1509, "trainId": 331},
394
+ {"name": "stands", "id": 2539, "trainId": 332},
395
+ {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333},
396
+ {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334},
397
+ {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335},
398
+ {"name": "dummy", "id": 818, "trainId": 336},
399
+ {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337},
400
+ {"name": "sand trap", "id": 2217, "trainId": 338},
401
+ {"name": "shop, store", "id": 2347, "trainId": 339},
402
+ {"name": "table cloth", "id": 2686, "trainId": 340},
403
+ {"name": "service station", "id": 2300, "trainId": 341},
404
+ {"name": "coffin", "id": 572, "trainId": 342},
405
+ {"name": "drawer", "id": 789, "trainId": 343},
406
+ {"name": "cages", "id": 358, "trainId": 344},
407
+ {"name": "slot machine, coin machine", "id": 2443, "trainId": 345},
408
+ {"name": "balcony", "id": 101, "trainId": 346},
409
+ {"name": "volleyball court", "id": 2969, "trainId": 347},
410
+ {"name": "table tennis", "id": 2692, "trainId": 348},
411
+ {"name": "control table", "id": 606, "trainId": 349},
412
+ {"name": "shirt", "id": 2339, "trainId": 350},
413
+ {"name": "merchandise, ware, product", "id": 1533, "trainId": 351},
414
+ {"name": "railway", "id": 2060, "trainId": 352},
415
+ {"name": "parterre", "id": 1782, "trainId": 353},
416
+ {"name": "chimney", "id": 495, "trainId": 354},
417
+ {"name": "can, tin, tin can", "id": 371, "trainId": 355},
418
+ {"name": "tanks", "id": 2707, "trainId": 356},
419
+ {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357},
420
+ {"name": "alga, algae", "id": 3156, "trainId": 358},
421
+ {"name": "system", "id": 2683, "trainId": 359},
422
+ {"name": "map", "id": 1499, "trainId": 360},
423
+ {"name": "greenhouse", "id": 1135, "trainId": 361},
424
+ {"name": "mug", "id": 1619, "trainId": 362},
425
+ {"name": "barbecue", "id": 125, "trainId": 363},
426
+ {"name": "trailer", "id": 2838, "trainId": 364},
427
+ {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365},
428
+ {"name": "organ", "id": 1695, "trainId": 366},
429
+ {"name": "dishrag, dishcloth", "id": 746, "trainId": 367},
430
+ {"name": "island", "id": 1343, "trainId": 368},
431
+ {"name": "keyboard", "id": 1370, "trainId": 369},
432
+ {"name": "trench", "id": 2858, "trainId": 370},
433
+ {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371},
434
+ {"name": "steering wheel, wheel", "id": 2565, "trainId": 372},
435
+ {"name": "pitcher, ewer", "id": 1892, "trainId": 373},
436
+ {"name": "goal", "id": 1103, "trainId": 374},
437
+ {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375},
438
+ {"name": "beds", "id": 170, "trainId": 376},
439
+ {"name": "wood", "id": 3073, "trainId": 377},
440
+ {"name": "file cabinet", "id": 922, "trainId": 378},
441
+ {"name": "newspaper, paper", "id": 1655, "trainId": 379},
442
+ {"name": "motorboat", "id": 1602, "trainId": 380},
443
+ {"name": "rope", "id": 2160, "trainId": 381},
444
+ {"name": "guitar", "id": 1151, "trainId": 382},
445
+ {"name": "rubble", "id": 2176, "trainId": 383},
446
+ {"name": "scarf", "id": 2239, "trainId": 384},
447
+ {"name": "barrels", "id": 132, "trainId": 385},
448
+ {"name": "cap", "id": 394, "trainId": 386},
449
+ {"name": "leaves", "id": 1424, "trainId": 387},
450
+ {"name": "control tower", "id": 607, "trainId": 388},
451
+ {"name": "dashboard", "id": 700, "trainId": 389},
452
+ {"name": "bandstand", "id": 116, "trainId": 390},
453
+ {"name": "lectern", "id": 1425, "trainId": 391},
454
+ {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392},
455
+ {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393},
456
+ {"name": "shower room", "id": 2360, "trainId": 394},
457
+ {"name": "smoke", "id": 2449, "trainId": 395},
458
+ {"name": "faucet, spigot", "id": 897, "trainId": 396},
459
+ {"name": "bulldozer", "id": 317, "trainId": 397},
460
+ {"name": "saucepan", "id": 2228, "trainId": 398},
461
+ {"name": "shops", "id": 2351, "trainId": 399},
462
+ {"name": "meter", "id": 1543, "trainId": 400},
463
+ {"name": "crevasse", "id": 656, "trainId": 401},
464
+ {"name": "gear", "id": 1088, "trainId": 402},
465
+ {"name": "candelabrum, candelabra", "id": 373, "trainId": 403},
466
+ {"name": "sofa bed", "id": 2472, "trainId": 404},
467
+ {"name": "tunnel", "id": 2892, "trainId": 405},
468
+ {"name": "pallet", "id": 1740, "trainId": 406},
469
+ {"name": "wire, conducting wire", "id": 3067, "trainId": 407},
470
+ {"name": "kettle, boiler", "id": 1367, "trainId": 408},
471
+ {"name": "bidet", "id": 188, "trainId": 409},
472
+ {
473
+ "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher",
474
+ "id": 79,
475
+ "trainId": 410,
476
+ },
477
+ {"name": "music stand", "id": 1633, "trainId": 411},
478
+ {"name": "pipe, tube", "id": 1885, "trainId": 412},
479
+ {"name": "cup", "id": 677, "trainId": 413},
480
+ {"name": "parking meter", "id": 1779, "trainId": 414},
481
+ {"name": "ice hockey rink", "id": 1297, "trainId": 415},
482
+ {"name": "shelter", "id": 2334, "trainId": 416},
483
+ {"name": "weeds", "id": 3027, "trainId": 417},
484
+ {"name": "temple", "id": 2735, "trainId": 418},
485
+ {"name": "patty, cake", "id": 1791, "trainId": 419},
486
+ {"name": "ski slope", "id": 2405, "trainId": 420},
487
+ {"name": "panel", "id": 1748, "trainId": 421},
488
+ {"name": "wallet", "id": 2983, "trainId": 422},
489
+ {"name": "wheel", "id": 3035, "trainId": 423},
490
+ {"name": "towel rack, towel horse", "id": 2824, "trainId": 424},
491
+ {"name": "roundabout", "id": 2168, "trainId": 425},
492
+ {"name": "canister, cannister, tin", "id": 385, "trainId": 426},
493
+ {"name": "rod", "id": 2148, "trainId": 427},
494
+ {"name": "soap dispenser", "id": 2465, "trainId": 428},
495
+ {"name": "bell", "id": 175, "trainId": 429},
496
+ {"name": "canvas", "id": 390, "trainId": 430},
497
+ {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431},
498
+ {"name": "teacup", "id": 2722, "trainId": 432},
499
+ {"name": "trellis", "id": 2857, "trainId": 433},
500
+ {"name": "workbench", "id": 3088, "trainId": 434},
501
+ {"name": "valley, vale", "id": 2926, "trainId": 435},
502
+ {"name": "toaster", "id": 2782, "trainId": 436},
503
+ {"name": "knife", "id": 1378, "trainId": 437},
504
+ {"name": "podium", "id": 1934, "trainId": 438},
505
+ {"name": "ramp", "id": 2072, "trainId": 439},
506
+ {"name": "tumble dryer", "id": 2889, "trainId": 440},
507
+ {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441},
508
+ {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442},
509
+ {"name": "lab bench", "id": 1383, "trainId": 443},
510
+ {"name": "equipment", "id": 867, "trainId": 444},
511
+ {"name": "rocky formation", "id": 2145, "trainId": 445},
512
+ {"name": "plastic", "id": 1915, "trainId": 446},
513
+ {"name": "calendar", "id": 361, "trainId": 447},
514
+ {"name": "caravan", "id": 402, "trainId": 448},
515
+ {"name": "check-in-desk", "id": 482, "trainId": 449},
516
+ {"name": "ticket counter", "id": 2761, "trainId": 450},
517
+ {"name": "brush", "id": 300, "trainId": 451},
518
+ {"name": "mill", "id": 1554, "trainId": 452},
519
+ {"name": "covered bridge", "id": 636, "trainId": 453},
520
+ {"name": "bowling alley", "id": 260, "trainId": 454},
521
+ {"name": "hanger", "id": 1186, "trainId": 455},
522
+ {"name": "excavator", "id": 871, "trainId": 456},
523
+ {"name": "trestle", "id": 2859, "trainId": 457},
524
+ {"name": "revolving door", "id": 2103, "trainId": 458},
525
+ {"name": "blast furnace", "id": 208, "trainId": 459},
526
+ {"name": "scale, weighing machine", "id": 2236, "trainId": 460},
527
+ {"name": "projector", "id": 2012, "trainId": 461},
528
+ {"name": "soap", "id": 2462, "trainId": 462},
529
+ {"name": "locker", "id": 1462, "trainId": 463},
530
+ {"name": "tractor", "id": 2832, "trainId": 464},
531
+ {"name": "stretcher", "id": 2617, "trainId": 465},
532
+ {"name": "frame", "id": 1024, "trainId": 466},
533
+ {"name": "grating", "id": 1129, "trainId": 467},
534
+ {"name": "alembic", "id": 18, "trainId": 468},
535
+ {"name": "candle, taper, wax light", "id": 376, "trainId": 469},
536
+ {"name": "barrier", "id": 134, "trainId": 470},
537
+ {"name": "cardboard", "id": 407, "trainId": 471},
538
+ {"name": "cave", "id": 434, "trainId": 472},
539
+ {"name": "puddle", "id": 2017, "trainId": 473},
540
+ {"name": "tarp", "id": 2717, "trainId": 474},
541
+ {"name": "price tag", "id": 2005, "trainId": 475},
542
+ {"name": "watchtower", "id": 2993, "trainId": 476},
543
+ {"name": "meters", "id": 1545, "trainId": 477},
544
+ {
545
+ "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb",
546
+ "id": 1445,
547
+ "trainId": 478,
548
+ },
549
+ {"name": "tracks", "id": 2831, "trainId": 479},
550
+ {"name": "hair dryer", "id": 1161, "trainId": 480},
551
+ {"name": "skirt", "id": 2411, "trainId": 481},
552
+ {"name": "viaduct", "id": 2949, "trainId": 482},
553
+ {"name": "paper towel", "id": 1769, "trainId": 483},
554
+ {"name": "coat", "id": 552, "trainId": 484},
555
+ {"name": "sheet", "id": 2327, "trainId": 485},
556
+ {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486},
557
+ {"name": "water wheel", "id": 3013, "trainId": 487},
558
+ {"name": "pottery, clayware", "id": 1986, "trainId": 488},
559
+ {"name": "magazine rack", "id": 1486, "trainId": 489},
560
+ {"name": "teapot", "id": 2723, "trainId": 490},
561
+ {"name": "microphone, mike", "id": 1549, "trainId": 491},
562
+ {"name": "support", "id": 2649, "trainId": 492},
563
+ {"name": "forklift", "id": 1020, "trainId": 493},
564
+ {"name": "canyon", "id": 392, "trainId": 494},
565
+ {"name": "cash register, register", "id": 422, "trainId": 495},
566
+ {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496},
567
+ {"name": "remote control, remote", "id": 2099, "trainId": 497},
568
+ {"name": "soap dish", "id": 2464, "trainId": 498},
569
+ {"name": "windshield, windscreen", "id": 3058, "trainId": 499},
570
+ {"name": "cat", "id": 430, "trainId": 500},
571
+ {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501},
572
+ {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502},
573
+ {"name": "videos", "id": 2955, "trainId": 503},
574
+ {"name": "shovel", "id": 2355, "trainId": 504},
575
+ {"name": "eaves", "id": 840, "trainId": 505},
576
+ {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506},
577
+ {"name": "shipyard", "id": 2338, "trainId": 507},
578
+ {"name": "hen, biddy", "id": 1232, "trainId": 508},
579
+ {"name": "traffic cone", "id": 2834, "trainId": 509},
580
+ {"name": "washing machines", "id": 2991, "trainId": 510},
581
+ {"name": "truck crane", "id": 2879, "trainId": 511},
582
+ {"name": "cds", "id": 444, "trainId": 512},
583
+ {"name": "niche", "id": 1657, "trainId": 513},
584
+ {"name": "scoreboard", "id": 2246, "trainId": 514},
585
+ {"name": "briefcase", "id": 296, "trainId": 515},
586
+ {"name": "boot", "id": 245, "trainId": 516},
587
+ {"name": "sweater, jumper", "id": 2661, "trainId": 517},
588
+ {"name": "hay", "id": 1202, "trainId": 518},
589
+ {"name": "pack", "id": 1714, "trainId": 519},
590
+ {"name": "bottle rack", "id": 251, "trainId": 520},
591
+ {"name": "glacier", "id": 1095, "trainId": 521},
592
+ {"name": "pergola", "id": 1828, "trainId": 522},
593
+ {"name": "building materials", "id": 311, "trainId": 523},
594
+ {"name": "television camera", "id": 2732, "trainId": 524},
595
+ {"name": "first floor", "id": 947, "trainId": 525},
596
+ {"name": "rifle", "id": 2115, "trainId": 526},
597
+ {"name": "tennis table", "id": 2738, "trainId": 527},
598
+ {"name": "stadium", "id": 2525, "trainId": 528},
599
+ {"name": "safety belt", "id": 2194, "trainId": 529},
600
+ {"name": "cover", "id": 634, "trainId": 530},
601
+ {"name": "dish rack", "id": 740, "trainId": 531},
602
+ {"name": "synthesizer", "id": 2682, "trainId": 532},
603
+ {"name": "pumpkin", "id": 2020, "trainId": 533},
604
+ {"name": "gutter", "id": 1156, "trainId": 534},
605
+ {"name": "fruit stand", "id": 1036, "trainId": 535},
606
+ {"name": "ice floe, floe", "id": 1295, "trainId": 536},
607
+ {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537},
608
+ {"name": "wheelchair", "id": 3037, "trainId": 538},
609
+ {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539},
610
+ {"name": "diploma", "id": 736, "trainId": 540},
611
+ {"name": "fairground ride", "id": 893, "trainId": 541},
612
+ {"name": "radio", "id": 2047, "trainId": 542},
613
+ {"name": "hotplate", "id": 1274, "trainId": 543},
614
+ {"name": "junk", "id": 1361, "trainId": 544},
615
+ {"name": "wheelbarrow", "id": 3036, "trainId": 545},
616
+ {"name": "stream", "id": 2606, "trainId": 546},
617
+ {"name": "toll plaza", "id": 2797, "trainId": 547},
618
+ {"name": "punching bag", "id": 2022, "trainId": 548},
619
+ {"name": "trough", "id": 2876, "trainId": 549},
620
+ {"name": "throne", "id": 2758, "trainId": 550},
621
+ {"name": "chair desk", "id": 472, "trainId": 551},
622
+ {"name": "weighbridge", "id": 3028, "trainId": 552},
623
+ {"name": "extractor fan", "id": 882, "trainId": 553},
624
+ {"name": "hanging clothes", "id": 1189, "trainId": 554},
625
+ {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555},
626
+ {"name": "alarm clock, alarm", "id": 3122, "trainId": 556},
627
+ {"name": "ski lift", "id": 2401, "trainId": 557},
628
+ {"name": "chain", "id": 468, "trainId": 558},
629
+ {"name": "garage", "id": 1061, "trainId": 559},
630
+ {"name": "mechanical shovel", "id": 1523, "trainId": 560},
631
+ {"name": "wine rack", "id": 3059, "trainId": 561},
632
+ {"name": "tramway", "id": 2843, "trainId": 562},
633
+ {"name": "treadmill", "id": 2853, "trainId": 563},
634
+ {"name": "menu", "id": 1529, "trainId": 564},
635
+ {"name": "block", "id": 214, "trainId": 565},
636
+ {"name": "well", "id": 3032, "trainId": 566},
637
+ {"name": "witness stand", "id": 3071, "trainId": 567},
638
+ {"name": "branch", "id": 277, "trainId": 568},
639
+ {"name": "duck", "id": 813, "trainId": 569},
640
+ {"name": "casserole", "id": 426, "trainId": 570},
641
+ {"name": "frying pan", "id": 1039, "trainId": 571},
642
+ {"name": "desk organizer", "id": 727, "trainId": 572},
643
+ {"name": "mast", "id": 1508, "trainId": 573},
644
+ {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574},
645
+ {"name": "service elevator", "id": 2299, "trainId": 575},
646
+ {"name": "dollhouse", "id": 768, "trainId": 576},
647
+ {"name": "hammock", "id": 1172, "trainId": 577},
648
+ {"name": "clothes hanging", "id": 537, "trainId": 578},
649
+ {"name": "photocopier", "id": 1847, "trainId": 579},
650
+ {"name": "notepad", "id": 1664, "trainId": 580},
651
+ {"name": "golf cart", "id": 1110, "trainId": 581},
652
+ {"name": "footpath", "id": 1014, "trainId": 582},
653
+ {"name": "cross", "id": 662, "trainId": 583},
654
+ {"name": "baptismal font", "id": 121, "trainId": 584},
655
+ {"name": "boiler", "id": 227, "trainId": 585},
656
+ {"name": "skip", "id": 2410, "trainId": 586},
657
+ {"name": "rotisserie", "id": 2165, "trainId": 587},
658
+ {"name": "tables", "id": 2696, "trainId": 588},
659
+ {"name": "water mill", "id": 3005, "trainId": 589},
660
+ {"name": "helmet", "id": 1231, "trainId": 590},
661
+ {"name": "cover curtain", "id": 635, "trainId": 591},
662
+ {"name": "brick", "id": 292, "trainId": 592},
663
+ {"name": "table runner", "id": 2690, "trainId": 593},
664
+ {"name": "ashtray", "id": 65, "trainId": 594},
665
+ {"name": "street box", "id": 2607, "trainId": 595},
666
+ {"name": "stick", "id": 2574, "trainId": 596},
667
+ {"name": "hangers", "id": 1188, "trainId": 597},
668
+ {"name": "cells", "id": 456, "trainId": 598},
669
+ {"name": "urinal", "id": 2913, "trainId": 599},
670
+ {"name": "centerpiece", "id": 459, "trainId": 600},
671
+ {"name": "portable fridge", "id": 1955, "trainId": 601},
672
+ {"name": "dvds", "id": 827, "trainId": 602},
673
+ {"name": "golf club", "id": 1111, "trainId": 603},
674
+ {"name": "skirting board", "id": 2412, "trainId": 604},
675
+ {"name": "water cooler", "id": 2997, "trainId": 605},
676
+ {"name": "clipboard", "id": 528, "trainId": 606},
677
+ {"name": "camera, photographic camera", "id": 366, "trainId": 607},
678
+ {"name": "pigeonhole", "id": 1863, "trainId": 608},
679
+ {"name": "chips", "id": 500, "trainId": 609},
680
+ {"name": "food processor", "id": 1001, "trainId": 610},
681
+ {"name": "post box", "id": 1958, "trainId": 611},
682
+ {"name": "lid", "id": 1441, "trainId": 612},
683
+ {"name": "drum", "id": 809, "trainId": 613},
684
+ {"name": "blender", "id": 210, "trainId": 614},
685
+ {"name": "cave entrance", "id": 435, "trainId": 615},
686
+ {"name": "dental chair", "id": 718, "trainId": 616},
687
+ {"name": "obelisk", "id": 1674, "trainId": 617},
688
+ {"name": "canoe", "id": 388, "trainId": 618},
689
+ {"name": "mobile", "id": 1572, "trainId": 619},
690
+ {"name": "monitors", "id": 1584, "trainId": 620},
691
+ {"name": "pool ball", "id": 1944, "trainId": 621},
692
+ {"name": "cue rack", "id": 674, "trainId": 622},
693
+ {"name": "baggage carts", "id": 99, "trainId": 623},
694
+ {"name": "shore", "id": 2352, "trainId": 624},
695
+ {"name": "fork", "id": 1019, "trainId": 625},
696
+ {"name": "paper filer", "id": 1763, "trainId": 626},
697
+ {"name": "bicycle rack", "id": 185, "trainId": 627},
698
+ {"name": "coat rack", "id": 554, "trainId": 628},
699
+ {"name": "garland", "id": 1066, "trainId": 629},
700
+ {"name": "sports bag", "id": 2508, "trainId": 630},
701
+ {"name": "fish tank", "id": 951, "trainId": 631},
702
+ {"name": "towel dispenser", "id": 2822, "trainId": 632},
703
+ {"name": "carriage", "id": 415, "trainId": 633},
704
+ {"name": "brochure", "id": 297, "trainId": 634},
705
+ {"name": "plaque", "id": 1914, "trainId": 635},
706
+ {"name": "stringer", "id": 2619, "trainId": 636},
707
+ {"name": "iron", "id": 1338, "trainId": 637},
708
+ {"name": "spoon", "id": 2505, "trainId": 638},
709
+ {"name": "flag pole", "id": 955, "trainId": 639},
710
+ {"name": "toilet brush", "id": 2786, "trainId": 640},
711
+ {"name": "book stand", "id": 238, "trainId": 641},
712
+ {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642},
713
+ {"name": "ticket office", "id": 2763, "trainId": 643},
714
+ {"name": "broom", "id": 299, "trainId": 644},
715
+ {"name": "dvd", "id": 822, "trainId": 645},
716
+ {"name": "ice bucket", "id": 1288, "trainId": 646},
717
+ {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647},
718
+ {"name": "tureen", "id": 2894, "trainId": 648},
719
+ {"name": "folders", "id": 992, "trainId": 649},
720
+ {"name": "chess", "id": 489, "trainId": 650},
721
+ {"name": "root", "id": 2157, "trainId": 651},
722
+ {"name": "sewing machine", "id": 2309, "trainId": 652},
723
+ {"name": "model", "id": 1576, "trainId": 653},
724
+ {"name": "pen", "id": 1810, "trainId": 654},
725
+ {"name": "violin", "id": 2964, "trainId": 655},
726
+ {"name": "sweatshirt", "id": 2662, "trainId": 656},
727
+ {"name": "recycling materials", "id": 2087, "trainId": 657},
728
+ {"name": "mitten", "id": 1569, "trainId": 658},
729
+ {"name": "chopping board, cutting board", "id": 503, "trainId": 659},
730
+ {"name": "mask", "id": 1505, "trainId": 660},
731
+ {"name": "log", "id": 1468, "trainId": 661},
732
+ {"name": "mouse, computer mouse", "id": 1613, "trainId": 662},
733
+ {"name": "grill", "id": 1138, "trainId": 663},
734
+ {"name": "hole", "id": 1256, "trainId": 664},
735
+ {"name": "target", "id": 2715, "trainId": 665},
736
+ {"name": "trash bag", "id": 2846, "trainId": 666},
737
+ {"name": "chalk", "id": 477, "trainId": 667},
738
+ {"name": "sticks", "id": 2576, "trainId": 668},
739
+ {"name": "balloon", "id": 108, "trainId": 669},
740
+ {"name": "score", "id": 2245, "trainId": 670},
741
+ {"name": "hair spray", "id": 1162, "trainId": 671},
742
+ {"name": "roll", "id": 2149, "trainId": 672},
743
+ {"name": "runner", "id": 2183, "trainId": 673},
744
+ {"name": "engine", "id": 858, "trainId": 674},
745
+ {"name": "inflatable glove", "id": 1324, "trainId": 675},
746
+ {"name": "games", "id": 1055, "trainId": 676},
747
+ {"name": "pallets", "id": 1741, "trainId": 677},
748
+ {"name": "baskets", "id": 149, "trainId": 678},
749
+ {"name": "coop", "id": 615, "trainId": 679},
750
+ {"name": "dvd player", "id": 825, "trainId": 680},
751
+ {"name": "rocking horse", "id": 2143, "trainId": 681},
752
+ {"name": "buckets", "id": 304, "trainId": 682},
753
+ {"name": "bread rolls", "id": 283, "trainId": 683},
754
+ {"name": "shawl", "id": 2322, "trainId": 684},
755
+ {"name": "watering can", "id": 3017, "trainId": 685},
756
+ {"name": "spotlights", "id": 2510, "trainId": 686},
757
+ {"name": "post-it", "id": 1960, "trainId": 687},
758
+ {"name": "bowls", "id": 265, "trainId": 688},
759
+ {"name": "security camera", "id": 2282, "trainId": 689},
760
+ {"name": "runner cloth", "id": 2184, "trainId": 690},
761
+ {"name": "lock", "id": 1461, "trainId": 691},
762
+ {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692},
763
+ {"name": "side", "id": 2372, "trainId": 693},
764
+ {"name": "roulette", "id": 2166, "trainId": 694},
765
+ {"name": "bone", "id": 232, "trainId": 695},
766
+ {"name": "cutlery", "id": 693, "trainId": 696},
767
+ {"name": "pool balls", "id": 1945, "trainId": 697},
768
+ {"name": "wheels", "id": 3039, "trainId": 698},
769
+ {"name": "spice rack", "id": 2494, "trainId": 699},
770
+ {"name": "plant pots", "id": 1908, "trainId": 700},
771
+ {"name": "towel ring", "id": 2827, "trainId": 701},
772
+ {"name": "bread box", "id": 280, "trainId": 702},
773
+ {"name": "video", "id": 2950, "trainId": 703},
774
+ {"name": "funfair", "id": 1044, "trainId": 704},
775
+ {"name": "breads", "id": 288, "trainId": 705},
776
+ {"name": "tripod", "id": 2863, "trainId": 706},
777
+ {"name": "ironing board", "id": 1342, "trainId": 707},
778
+ {"name": "skimmer", "id": 2409, "trainId": 708},
779
+ {"name": "hollow", "id": 1258, "trainId": 709},
780
+ {"name": "scratching post", "id": 2249, "trainId": 710},
781
+ {"name": "tricycle", "id": 2862, "trainId": 711},
782
+ {"name": "file box", "id": 920, "trainId": 712},
783
+ {"name": "mountain pass", "id": 1607, "trainId": 713},
784
+ {"name": "tombstones", "id": 2802, "trainId": 714},
785
+ {"name": "cooker", "id": 610, "trainId": 715},
786
+ {"name": "card game, cards", "id": 3129, "trainId": 716},
787
+ {"name": "golf bag", "id": 1108, "trainId": 717},
788
+ {"name": "towel paper", "id": 2823, "trainId": 718},
789
+ {"name": "chaise lounge", "id": 476, "trainId": 719},
790
+ {"name": "sun", "id": 2641, "trainId": 720},
791
+ {"name": "toilet paper holder", "id": 2788, "trainId": 721},
792
+ {"name": "rake", "id": 2070, "trainId": 722},
793
+ {"name": "key", "id": 1368, "trainId": 723},
794
+ {"name": "umbrella stand", "id": 2903, "trainId": 724},
795
+ {"name": "dartboard", "id": 699, "trainId": 725},
796
+ {"name": "transformer", "id": 2844, "trainId": 726},
797
+ {"name": "fireplace utensils", "id": 942, "trainId": 727},
798
+ {"name": "sweatshirts", "id": 2663, "trainId": 728},
799
+ {
800
+ "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone",
801
+ "id": 457,
802
+ "trainId": 729,
803
+ },
804
+ {"name": "tallboy", "id": 2701, "trainId": 730},
805
+ {"name": "stapler", "id": 2540, "trainId": 731},
806
+ {"name": "sauna", "id": 2231, "trainId": 732},
807
+ {"name": "test tube", "id": 2746, "trainId": 733},
808
+ {"name": "palette", "id": 1738, "trainId": 734},
809
+ {"name": "shopping carts", "id": 2350, "trainId": 735},
810
+ {"name": "tools", "id": 2808, "trainId": 736},
811
+ {"name": "push button, push, button", "id": 2025, "trainId": 737},
812
+ {"name": "star", "id": 2541, "trainId": 738},
813
+ {"name": "roof rack", "id": 2156, "trainId": 739},
814
+ {"name": "barbed wire", "id": 126, "trainId": 740},
815
+ {"name": "spray", "id": 2512, "trainId": 741},
816
+ {"name": "ear", "id": 831, "trainId": 742},
817
+ {"name": "sponge", "id": 2503, "trainId": 743},
818
+ {"name": "racket", "id": 2039, "trainId": 744},
819
+ {"name": "tins", "id": 2774, "trainId": 745},
820
+ {"name": "eyeglasses", "id": 886, "trainId": 746},
821
+ {"name": "file", "id": 919, "trainId": 747},
822
+ {"name": "scarfs", "id": 2240, "trainId": 748},
823
+ {"name": "sugar bowl", "id": 2636, "trainId": 749},
824
+ {"name": "flip flop", "id": 963, "trainId": 750},
825
+ {"name": "headstones", "id": 1218, "trainId": 751},
826
+ {"name": "laptop bag", "id": 1406, "trainId": 752},
827
+ {"name": "leash", "id": 1420, "trainId": 753},
828
+ {"name": "climbing frame", "id": 526, "trainId": 754},
829
+ {"name": "suit hanger", "id": 2639, "trainId": 755},
830
+ {"name": "floor spotlight", "id": 975, "trainId": 756},
831
+ {"name": "plate rack", "id": 1921, "trainId": 757},
832
+ {"name": "sewer", "id": 2305, "trainId": 758},
833
+ {"name": "hard drive", "id": 1193, "trainId": 759},
834
+ {"name": "sprinkler", "id": 2517, "trainId": 760},
835
+ {"name": "tools box", "id": 2809, "trainId": 761},
836
+ {"name": "necklace", "id": 1647, "trainId": 762},
837
+ {"name": "bulbs", "id": 314, "trainId": 763},
838
+ {"name": "steel industry", "id": 2560, "trainId": 764},
839
+ {"name": "club", "id": 545, "trainId": 765},
840
+ {"name": "jack", "id": 1345, "trainId": 766},
841
+ {"name": "door bars", "id": 775, "trainId": 767},
842
+ {
843
+ "name": "control panel, instrument panel, control board, board, panel",
844
+ "id": 603,
845
+ "trainId": 768,
846
+ },
847
+ {"name": "hairbrush", "id": 1163, "trainId": 769},
848
+ {"name": "napkin holder", "id": 1641, "trainId": 770},
849
+ {"name": "office", "id": 1678, "trainId": 771},
850
+ {"name": "smoke detector", "id": 2450, "trainId": 772},
851
+ {"name": "utensils", "id": 2915, "trainId": 773},
852
+ {"name": "apron", "id": 42, "trainId": 774},
853
+ {"name": "scissors", "id": 2242, "trainId": 775},
854
+ {"name": "terminal", "id": 2741, "trainId": 776},
855
+ {"name": "grinder", "id": 1143, "trainId": 777},
856
+ {"name": "entry phone", "id": 862, "trainId": 778},
857
+ {"name": "newspaper stand", "id": 1654, "trainId": 779},
858
+ {"name": "pepper shaker", "id": 1826, "trainId": 780},
859
+ {"name": "onions", "id": 1689, "trainId": 781},
860
+ {
861
+ "name": "central processing unit, cpu, c p u , central processor, processor, mainframe",
862
+ "id": 3124,
863
+ "trainId": 782,
864
+ },
865
+ {"name": "tape", "id": 2710, "trainId": 783},
866
+ {"name": "bat", "id": 152, "trainId": 784},
867
+ {"name": "coaster", "id": 549, "trainId": 785},
868
+ {"name": "calculator", "id": 360, "trainId": 786},
869
+ {"name": "potatoes", "id": 1982, "trainId": 787},
870
+ {"name": "luggage rack", "id": 1478, "trainId": 788},
871
+ {"name": "salt", "id": 2203, "trainId": 789},
872
+ {"name": "street number", "id": 2612, "trainId": 790},
873
+ {"name": "viewpoint", "id": 2956, "trainId": 791},
874
+ {"name": "sword", "id": 2681, "trainId": 792},
875
+ {"name": "cd", "id": 437, "trainId": 793},
876
+ {"name": "rowing machine", "id": 2171, "trainId": 794},
877
+ {"name": "plug", "id": 1933, "trainId": 795},
878
+ {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796},
879
+ {"name": "pepper", "id": 1824, "trainId": 797},
880
+ {"name": "tongs", "id": 2803, "trainId": 798},
881
+ {"name": "bonfire", "id": 234, "trainId": 799},
882
+ {"name": "dog dish", "id": 764, "trainId": 800},
883
+ {"name": "belt", "id": 177, "trainId": 801},
884
+ {"name": "dumbbells", "id": 817, "trainId": 802},
885
+ {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803},
886
+ {"name": "hook", "id": 1262, "trainId": 804},
887
+ {"name": "envelopes", "id": 864, "trainId": 805},
888
+ {"name": "shower faucet", "id": 2359, "trainId": 806},
889
+ {"name": "watch", "id": 2992, "trainId": 807},
890
+ {"name": "padlock", "id": 1725, "trainId": 808},
891
+ {"name": "swimming pool ladder", "id": 2667, "trainId": 809},
892
+ {"name": "spanners", "id": 2484, "trainId": 810},
893
+ {"name": "gravy boat", "id": 1133, "trainId": 811},
894
+ {"name": "notice board", "id": 1667, "trainId": 812},
895
+ {"name": "trash bags", "id": 2847, "trainId": 813},
896
+ {"name": "fire alarm", "id": 932, "trainId": 814},
897
+ {"name": "ladle", "id": 1392, "trainId": 815},
898
+ {"name": "stethoscope", "id": 2573, "trainId": 816},
899
+ {"name": "rocket", "id": 2140, "trainId": 817},
900
+ {"name": "funnel", "id": 1046, "trainId": 818},
901
+ {"name": "bowling pins", "id": 264, "trainId": 819},
902
+ {"name": "valve", "id": 2927, "trainId": 820},
903
+ {"name": "thermometer", "id": 2752, "trainId": 821},
904
+ {"name": "cups", "id": 679, "trainId": 822},
905
+ {"name": "spice jar", "id": 2493, "trainId": 823},
906
+ {"name": "night light", "id": 1658, "trainId": 824},
907
+ {"name": "soaps", "id": 2466, "trainId": 825},
908
+ {"name": "games table", "id": 1057, "trainId": 826},
909
+ {"name": "slotted spoon", "id": 2444, "trainId": 827},
910
+ {"name": "reel", "id": 2093, "trainId": 828},
911
+ {"name": "scourer", "id": 2248, "trainId": 829},
912
+ {"name": "sleeping robe", "id": 2432, "trainId": 830},
913
+ {"name": "desk mat", "id": 726, "trainId": 831},
914
+ {"name": "dumbbell", "id": 816, "trainId": 832},
915
+ {"name": "hammer", "id": 1171, "trainId": 833},
916
+ {"name": "tie", "id": 2766, "trainId": 834},
917
+ {"name": "typewriter", "id": 2900, "trainId": 835},
918
+ {"name": "shaker", "id": 2313, "trainId": 836},
919
+ {"name": "cheese dish", "id": 488, "trainId": 837},
920
+ {"name": "sea star", "id": 2265, "trainId": 838},
921
+ {"name": "racquet", "id": 2043, "trainId": 839},
922
+ {"name": "butane gas cylinder", "id": 332, "trainId": 840},
923
+ {"name": "paper weight", "id": 1771, "trainId": 841},
924
+ {"name": "shaving brush", "id": 2320, "trainId": 842},
925
+ {"name": "sunglasses", "id": 2646, "trainId": 843},
926
+ {"name": "gear shift", "id": 1089, "trainId": 844},
927
+ {"name": "towel rail", "id": 2826, "trainId": 845},
928
+ {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846},
929
+ ]
930
+
931
+
932
def loadAde20K(file):
    """Load an ADE20K image/annotation pair and decode the class mask.

    The companion ``*_seg.png`` annotation encodes the per-pixel class id in
    its red and green channels as ``(R // 10) * 256 + G``.

    Returns a dict with the image path, the segmentation path, and the
    decoded int32 class-id mask.
    """
    fileseg = file.replace(".jpg", "_seg.png")
    # NOTE: the original bound this handle to `io`, shadowing the stdlib
    # module name; use a descriptive local instead.
    with Image.open(fileseg) as seg_img:
        seg = np.array(seg_img)

    red = seg[:, :, 0]
    green = seg[:, :, 1]
    # Decode the object class id from the two colour channels.
    ObjectClassMasks = (red / 10).astype(np.int32) * 256 + green.astype(np.int32)

    return {"img_name": file, "segm_name": fileseg, "class_mask": ObjectClassMasks}
942
+
943
+
944
if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    index_file = dataset_dir / "ADE20K_2021_17_01" / "index_ade20k.pkl"
    print('Caution: we only generate the validation set!')
    with open(index_file, "rb") as f:
        index_ade20k = pkl.load(f)

    # Raw ADE20K id -> contiguous training id.
    id_map = {cat["id"]: cat["trainId"] for cat in ADE20K_SEM_SEG_FULL_CATEGORIES}

    # Create the output directory layout for both splits up front.
    for split_name in ["training", "validation"]:
        (dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / split_name).mkdir(
            parents=True, exist_ok=True
        )
        (
            dataset_dir / "ADE20K_2021_17_01" / "annotations_detectron2" / split_name
        ).mkdir(parents=True, exist_ok=True)

    # Process every image and its ground truth.
    for i, (folder_name, file_name) in tqdm.tqdm(
        enumerate(zip(index_ade20k["folder"], index_ade20k["filename"])),
        total=len(index_ade20k["filename"]),
    ):
        split = "validation" if file_name.split("_")[1] == "val" else "training"
        if split == 'training':
            # FIXME: If you want to generate training set, delete this condition
            continue
        info = loadAde20K(str(dataset_dir / folder_name / file_name))

        img = np.asarray(Image.open(info["img_name"]))
        lab = np.asarray(info["class_mask"])

        # Downscale so the shorter side is at most 512 px, preserving aspect
        # ratio; images already small enough are left untouched.
        h, w = img.shape[0], img.shape[1]
        max_size = 512
        new_size = None
        if w >= h > max_size:
            new_size = (round(w / float(h) * max_size), max_size)  # (w, h)
        elif h >= w > max_size:
            new_size = (max_size, round(h / float(w) * max_size))  # (w, h)

        if new_size is not None:
            img = cv2.resize(img, new_size, interpolation=cv2.INTER_LINEAR)
            lab = cv2.resize(lab, new_size, interpolation=cv2.INTER_NEAREST)

        assert img.dtype == np.uint8
        assert lab.dtype == np.int32

        # Convert labels and store as uint16; 65535 marks "unlabelled".
        output = np.full_like(lab, 65535, dtype=np.uint16)
        for obj_id in np.unique(lab):
            if obj_id in id_map:
                output[lab == obj_id] = id_map[obj_id]

        output_img = (
            dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / split / file_name
        )
        output_lab = (
            dataset_dir
            / "ADE20K_2021_17_01"
            / "annotations_detectron2"
            / split
            / file_name.replace(".jpg", ".tif")
        )
        Image.fromarray(img).save(output_img)

        assert output.dtype == np.uint16
        Image.fromarray(output).save(output_lab)
datasets/prepare_ade20k_sem_seg.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import tqdm
9
+ from PIL import Image
10
+
11
+
12
def convert(input, output, index=None):
    """Convert one ADE20K annotation PNG to detectron2's label convention.

    Labels are shifted down by one so the original 0 (ignore) wraps to 255
    in uint8. If ``index`` is given, labels are additionally remapped so the
    position of a label inside ``index`` becomes its new id; labels not in
    ``index`` become 255.
    """
    img = np.asarray(Image.open(input))
    assert img.dtype == np.uint8
    img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
    if index is not None:
        mapping = {i: k for k, i in enumerate(index)}
        # Bug fix: `np.float` was a deprecated alias removed in NumPy 1.24;
        # the builtin `float` is the documented drop-in replacement.
        img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)(
            img.astype(float)
        ).astype(np.uint8)
    Image.fromarray(img).save(output)
22
+
23
+
24
if __name__ == "__main__":
    dataset_dir = (
        Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
    )
    print('Caution: we only generate the validation set!')
    # Convert every annotation PNG of the validation split in place.
    for name in ["validation"]:
        src_dir = dataset_dir / "annotations" / name
        dst_dir = dataset_dir / "annotations_detectron2" / name
        dst_dir.mkdir(parents=True, exist_ok=True)
        for src_file in tqdm.tqdm(list(src_dir.iterdir())):
            convert(src_file, dst_dir / src_file.name)
datasets/prepare_coco_stuff_sem_seg.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+ # Modified by Feng Liang from
4
+ # https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_coco_stuff_164k_sem_seg.py
5
+
6
+ import os
7
+ import os.path as osp
8
+ from pathlib import Path
9
+ import tqdm
10
+ from glob import glob
11
+
12
+ import numpy as np
13
+ from PIL import Image
14
+
15
+
16
# COCO-Stuff raw class id -> contiguous train id.
# The raw ids 0..181 contain 11 unused slots (the classic removed COCO
# categories); every remaining id is compressed onto 0..170 in ascending
# order, and 255 stays the ignore label.
_UNUSED_CLS_IDS = {11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90}
full_clsID_to_trID = {
    cls_id: train_id
    for train_id, cls_id in enumerate(
        i for i in range(182) if i not in _UNUSED_CLS_IDS
    )
}
full_clsID_to_trID[255] = 255
190
+
191
def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=full_clsID_to_trID, suffix=""
):
    """Remap one stuffthingmaps PNG to train ids and save it.

    Pixels whose class id has no entry in ``clsID_to_trID`` become 255
    (ignore). Masks that end up entirely ignore are skipped — nothing is
    written for them.
    """
    mask = np.array(Image.open(maskpath))
    remapped = np.full_like(mask, 255, dtype=np.uint8)
    for cls_id, train_id in clsID_to_trID.items():
        remapped[mask == cls_id] = train_id
    split_dir = "train2017" if is_train else "val2017"
    seg_filename = osp.join(out_mask_dir, split_dir + suffix, osp.basename(maskpath))
    uniques = np.unique(remapped)
    if len(uniques) == 1 and uniques[0] == 255:
        # All-ignore mask: no point writing a file.
        return
    Image.fromarray(remapped).save(seg_filename, "PNG")
206
+
207
+
208
+
209
if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the training set!')
    coco_path = dataset_dir / "coco"
    mask_dir = coco_path / "stuffthingmaps"
    out_mask_dir = coco_path / "stuffthingmaps_detectron2"
    # Only the training split is converted here.
    for name in ["train2017"]:
        os.makedirs(out_mask_dir / name, exist_ok=True)
    for file in tqdm.tqdm(glob(osp.join(mask_dir, "train2017", "*.png"))):
        convert_to_trainID(file, out_mask_dir, is_train=True)
datasets/prepare_pascal_context.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import tqdm
5
+ import os
6
+ import os.path as osp
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ from PIL import Image
11
+ import scipy.io
12
+
13
def convert_pc59(mask_path, new_mask_path, pc59_dict):
    """Convert one Pascal-Context .mat label map to the 59-class PNG format.

    ``pc59_dict`` maps train id -> raw 459-class id; pixels whose raw id is
    not referenced become 255 (ignore).
    """
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']

    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for trID, clsID in pc59_dict.items():
        mask_copy[mask == clsID] = trID

    min_value = np.amin(mask_copy)
    # Bug fix: the original used `assert ..., print(min_value)`, which prints
    # unconditionally (even on success) and attaches None as the assertion
    # message. Use a real message expression instead.
    assert min_value >= 0, f"invalid minimum label {min_value}"
    Image.fromarray(mask_copy).save(new_mask_path, "PNG")
24
+
25
def convert_pc459(mask_path, new_mask_path):
    """Convert one Pascal-Context .mat label map to the 459-class TIFF format.

    Raw labels are 1-based; they are shifted to 0-based before saving.
    """
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']
    mask = mask - 1
    min_value = np.amin(mask)
    # Bug fix: the original used `assert ..., print(min_value)`, which prints
    # unconditionally and attaches None as the assertion message. Use a real
    # message expression instead.
    assert min_value >= 0, f"invalid minimum label {min_value}"
    Image.fromarray(mask).save(new_mask_path, "TIFF")
32
+
33
+
34
if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    pc_path = dataset_dir / "VOCdevkit/VOC2010"

    # Parse the 459-class label list; lines look like "<idx>: <name>".
    # Bug fix: the original never closed any of the three files it opened —
    # use context managers throughout.
    pc459_dict = {}
    with open(pc_path / "labels.txt", "r") as pc459_labels:
        for line in pc459_labels:
            if ':' in line:
                idx, name = line.split(':')
                pc459_dict[name.strip()] = int(idx.strip())

    # Build train-id -> raw-459-id mapping for the 59-class subset.
    # Bug fix: the original tested `name is not ''`, an identity comparison
    # against a string literal (SyntaxWarning on Python >= 3.8 and not a
    # reliable equality test); use truthiness instead.
    pc59_dict = {}
    with open(pc_path / "59_labels.txt", "r") as pc59_labels:
        for i, line in enumerate(pc59_labels):
            name = line.split(':')[-1].strip()
            if name:
                pc59_dict[i] = pc459_dict[name]

    pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
    pc459_dir.mkdir(parents=True, exist_ok=True)
    pc59_dir = pc_path / "annotations_detectron2" / "pc59_val"
    pc59_dir.mkdir(parents=True, exist_ok=True)

    with open(pc_path / "pascalcontext_val.txt", "r") as val_list:
        val_ids = [line.strip() for line in val_list]

    # Convert every validation mask that exists on disk into both formats.
    for fileid in tqdm.tqdm(val_ids):
        ori_mask = f'{pc_path}/trainval/{fileid}.mat'
        pc459_dst = f'{pc459_dir}/{fileid}.tif'
        pc59_dst = f'{pc59_dir}/{fileid}.png'
        if osp.exists(ori_mask):
            convert_pc459(ori_mask, pc459_dst)
            convert_pc59(ori_mask, pc59_dst, pc59_dict)
datasets/prepare_voc_sem_seg.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+ # Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_voc_sem_seg.py
4
+
5
+ import os
6
+ import os.path as osp
7
+ from pathlib import Path
8
+ import tqdm
9
+
10
+ import numpy as np
11
+ from PIL import Image
12
+
13
+
14
# Pascal VOC class id -> train id: background (0) becomes ignore (255), the
# 20 object classes 1..20 shift down to 0..19, and 255 stays ignore.
clsID_to_trID = {cls_id: cls_id - 1 for cls_id in range(1, 21)}
clsID_to_trID[0] = 255
clsID_to_trID[255] = 255
38
+
39
def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
):
    """Remap one VOC segmentation PNG to train ids and save it.

    Pixels whose class id has no entry in ``clsID_to_trID`` become 255
    (ignore). Masks that end up entirely ignore are skipped — nothing is
    written for them.
    """
    mask = np.array(Image.open(maskpath))
    remapped = np.full_like(mask, 255, dtype=np.uint8)
    for cls_id, train_id in clsID_to_trID.items():
        remapped[mask == cls_id] = train_id
    split_dir = "train" if is_train else "val"
    seg_filename = osp.join(out_mask_dir, split_dir + suffix, osp.basename(maskpath))
    uniques = np.unique(remapped)
    if len(uniques) == 1 and uniques[0] == 255:
        # All-ignore mask: no point writing a file.
        return
    Image.fromarray(remapped).save(seg_filename, "PNG")
54
+
55
+
56
+
57
if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
    out_mask_dir = voc_path / "annotations_detectron2"
    out_image_dir = voc_path / "images_detectron2"
    for name in ["val"]:
        os.makedirs(out_mask_dir / name, exist_ok=True)
        os.makedirs(out_image_dir / name, exist_ok=True)
    # Bug fix: `np.str` was a deprecated alias removed in NumPy 1.24; the
    # builtin `str` is the documented drop-in replacement for loadtxt dtypes.
    val_ids = np.loadtxt(
        osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str
    ).tolist()
    val_list = [
        osp.join(voc_path, "SegmentationClassAug", f + ".png") for f in val_ids
    ]
    for file in tqdm.tqdm(val_list):
        convert_to_trainID(file, out_mask_dir, is_train=False)
demo.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import argparse
5
+ import glob
6
+ import multiprocessing as mp
7
+ import os
8
+ import time
9
+ import cv2
10
+ import tqdm
11
+ import numpy as np
12
+
13
+ from detectron2.config import get_cfg
14
+
15
+ from detectron2.projects.deeplab import add_deeplab_config
16
+ from detectron2.data.detection_utils import read_image
17
+ from detectron2.utils.logger import setup_logger
18
+ from open_vocab_seg import add_ovseg_config
19
+
20
+ from open_vocab_seg.utils import VisualizationDemo
21
+
22
+ # constants
23
+ WINDOW_NAME = "Open vocabulary segmentation"
24
+
25
+
26
def setup_cfg(args):
    """Build a frozen detectron2 config from ``args.config_file`` and ``args.opts``."""
    # load config from file and command-line arguments
    cfg = get_cfg()
    # deeplab defaults are required for the poly lr schedule
    add_deeplab_config(cfg)
    add_ovseg_config(cfg)
    cfg.merge_from_file(args.config_file)
    # CLI 'KEY VALUE' pairs override the file config
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    return cfg
36
+
37
+
38
def get_parser():
    """Build the argument parser for the open-vocabulary segmentation demo."""
    parser = argparse.ArgumentParser(
        description="Detectron2 demo for open vocabulary segmentation"
    )
    parser.add_argument(
        "--config-file",
        metavar="FILE",
        default="configs/ovseg_swinB_vitL_demo.yaml",
        help="path to config file",
    )
    parser.add_argument(
        "--input",
        nargs="+",
        help=(
            "A list of space separated input images; "
            "or a single glob pattern such as 'directory/*.jpg'"
        ),
    )
    parser.add_argument(
        "--class-names",
        nargs="+",
        help="A list of user-defined class_names",
    )
    parser.add_argument(
        "--output",
        help=(
            "A file or directory to save output visualizations. "
            "If not given, will show output in an OpenCV window."
        ),
    )
    # Everything after --opts is forwarded verbatim to cfg.merge_from_list.
    parser.add_argument(
        "--opts",
        nargs=argparse.REMAINDER,
        default=[],
        help="Modify config options using the command-line 'KEY VALUE' pairs",
    )
    return parser
69
+
70
+
71
+ if __name__ == "__main__":
72
+ mp.set_start_method("spawn", force=True)
73
+ args = get_parser().parse_args()
74
+ setup_logger(name="fvcore")
75
+ logger = setup_logger()
76
+ logger.info("Arguments: " + str(args))
77
+
78
+ cfg = setup_cfg(args)
79
+
80
+ demo = VisualizationDemo(cfg)
81
+ class_names = args.class_names
82
+ if args.input:
83
+ if len(args.input) == 1:
84
+ args.input = glob.glob(os.path.expanduser(args.input[0]))
85
+ assert args.input, "The input path(s) was not found"
86
+ for path in tqdm.tqdm(args.input, disable=not args.output):
87
+ # use PIL, to be consistent with evaluation
88
+ start_time = time.time()
89
+ predictions, visualized_output_rgb, visualized_output_depth, visualized_output_rgb_sam, visualized_output_depth_sam = demo.run_on_image_sam(path, class_names)
90
+ logger.info(
91
+ "{}: {} in {:.2f}s".format(
92
+ path,
93
+ "detected {} instances".format(len(predictions["instances"]))
94
+ if "instances" in predictions
95
+ else "finished",
96
+ time.time() - start_time,
97
+ )
98
+ )
99
+
100
+ if args.output:
101
+ if os.path.isdir(args.output):
102
+ assert os.path.isdir(args.output), args.output
103
+ out_filename = os.path.join(args.output, os.path.basename(path))
104
+ else:
105
+ assert len(args.input) == 1, "Please specify a directory with args.output"
106
+ out_filename = args.output
107
+ visualized_output_rgb.save('RGB_Semantic_SAM.png')
108
+ visualized_output_depth.save('Depth_Semantic_SAM.png')
109
+ visualized_output_rgb_sam.save('RGB_Semantic_SAM_Mask.png')
110
+ visualized_output_depth_sam.save('Depth_Semantic_SAM_Mask.png')
111
+ rgb_3d_sam = demo.get_xyzrgb('RGB_Semantic_SAM.png', path)
112
+ depth_3d_sam = demo.get_xyzrgb('Depth_Semantic_SAM.png', path)
113
+ rgb_3d_sam_mask = demo.get_xyzrgb('RGB_Semantic_SAM_Mask.png', path)
114
+ depth_3d_sam_mask = demo.get_xyzrgb('Depth_Semantic_SAM_Mask.png', path)
115
+ np.savez('xyzrgb.npz', rgb_3d_sam = rgb_3d_sam, depth_3d_sam = depth_3d_sam, rgb_3d_sam_mask = rgb_3d_sam_mask, depth_3d_sam_mask = depth_3d_sam_mask)
116
+ demo.render_3d_video('xyzrgb.npz', path)
117
+ else:
118
+ cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
119
+ cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1])
120
+ if cv2.waitKey(0) == 27:
121
+ break # esc to quit
122
+ else:
123
+ raise NotImplementedError
flagged/log.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ name,output,flag,username,timestamp
2
+ t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpii192qpn.png,,,2023-04-23 12:23:23.301078
3
+ t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpqm122tsi.png,,,2023-04-23 12:26:06.661559
flagged/output/tmpii192qpn.png ADDED
flagged/output/tmpqm122tsi.png ADDED
open_vocab_seg/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ from . import data
5
+ from . import modeling
6
+ from .config import add_ovseg_config
7
+
8
+ from .test_time_augmentation import SemanticSegmentorWithTTA
9
+ from .ovseg_model import OVSeg, OVSegDEMO
open_vocab_seg/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (415 Bytes). View file
 
open_vocab_seg/__pycache__/config.cpython-39.pyc ADDED
Binary file (3.15 kB). View file
 
open_vocab_seg/__pycache__/mask_former_model.cpython-39.pyc ADDED
Binary file (8.57 kB). View file
 
open_vocab_seg/__pycache__/ovseg_model.cpython-39.pyc ADDED
Binary file (10.9 kB). View file
 
open_vocab_seg/__pycache__/test_time_augmentation.cpython-39.pyc ADDED
Binary file (6.75 kB). View file
 
open_vocab_seg/config.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ from detectron2.config import CfgNode as CN
5
+
6
+
7
def add_mask_former_default_config(cfg):
    """Register MaskFormer's default config keys on ``cfg`` (in place).

    Must run before any config file referencing these keys is merged,
    since yacs rejects keys that were never registered.
    """
    # data config
    # select the dataset mapper
    cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
    # Color augmentation
    cfg.INPUT.COLOR_AUG_SSD = False
    # We retry random cropping until no single category in semantic segmentation GT occupies more
    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Pad image and segmentation GT in dataset mapper.
    cfg.INPUT.SIZE_DIVISIBILITY = -1

    # solver config
    # test batch size
    cfg.SOLVER.TEST_IMS_PER_BATCH = 1
    # weight decay on embedding
    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
    # optimizer
    cfg.SOLVER.OPTIMIZER = "ADAMW"
    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1

    # mask_former model config
    cfg.MODEL.MASK_FORMER = CN()

    # loss
    cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
    cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
    cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
    cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0

    # transformer config
    cfg.MODEL.MASK_FORMER.NHEADS = 8
    cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
    cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
    cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
    cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
    cfg.MODEL.MASK_FORMER.PRE_NORM = False

    cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
    cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100

    cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
    cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False

    # mask_former inference config
    cfg.MODEL.MASK_FORMER.TEST = CN()
    cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
    cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
    # you can use this config to override
    cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32

    # pixel decoder config
    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
    # adding transformer in pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
    # pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # swin transformer backbone
    cfg.MODEL.SWIN = CN()
    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
    cfg.MODEL.SWIN.PATCH_SIZE = 4
    cfg.MODEL.SWIN.EMBED_DIM = 96
    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
    cfg.MODEL.SWIN.WINDOW_SIZE = 7
    cfg.MODEL.SWIN.MLP_RATIO = 4.0
    cfg.MODEL.SWIN.QKV_BIAS = True
    cfg.MODEL.SWIN.QK_SCALE = None
    cfg.MODEL.SWIN.NORM_INDICES = None
    cfg.MODEL.SWIN.PROJECTION = False
    cfg.MODEL.SWIN.PROJECT_DIM = 256
    cfg.MODEL.SWIN.DROP_RATE = 0.0
    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
    cfg.MODEL.SWIN.APE = False
    cfg.MODEL.SWIN.PATCH_NORM = True
    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
90
+
91
def add_our_config(cfg):
    """Register ov-seg-specific config keys on ``cfg`` (in place).

    Extends the MaskFormer defaults with sliding-window inference,
    embedding-head, CLIP-adapter, and wandb options.
    """
    # sliding-window inference over large images
    cfg.TEST.SLIDING_WINDOW = False
    cfg.TEST.SLIDING_TILE_SIZE = 224
    cfg.TEST.SLIDING_OVERLAP = 2 / 3.0
    # whether to use dense crf
    cfg.TEST.DENSE_CRF = False
    # -1 means use every sample of each class
    cfg.DATASETS.SAMPLE_PER_CLASS = -1
    cfg.DATASETS.SAMPLE_SEED = 0
    # embedding head
    cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM = 512
    cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM = 1024
    cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS = 2
    # clip_adapter
    cfg.MODEL.CLIP_ADAPTER = CN()
    cfg.MODEL.CLIP_ADAPTER.TEXT_TEMPLATES = "vild"
    # for predefined
    cfg.MODEL.CLIP_ADAPTER.PREDEFINED_PROMPT_TEMPLATES = ["a photo of a {}."]
    # for learnable prompt
    cfg.MODEL.CLIP_ADAPTER.PROMPT_CHECKPOINT = ""
    cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME = "ViT-B/16"
    cfg.MODEL.CLIP_ADAPTER.MASK_FILL = "mean"
    cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO = 1.0
    cfg.MODEL.CLIP_ADAPTER.MASK_THR = 0.4
    cfg.MODEL.CLIP_ADAPTER.MASK_MATTING = False
    cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED = True
    cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE = True
    cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT = 0.7
    # for mask prompt
    cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH = 3
    cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD = False

    # wandb
    cfg.WANDB = CN()
    cfg.WANDB.PROJECT = "open_vocab_seg"
    cfg.WANDB.NAME = None
126
+
127
+
128
def add_ovseg_config(cfg):
    """
    Add config for open_vocab_seg.
    """
    # Order matters: MaskFormer defaults are registered first, then the
    # ov-seg-specific extensions are layered on top.
    for register_defaults in (add_mask_former_default_config, add_our_config):
        register_defaults(cfg)
open_vocab_seg/data/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ from .dataset_mappers import *
5
+ from . import datasets
6
+ from .build import (
7
+ build_detection_train_loader,
8
+ build_detection_test_loader,
9
+ )
open_vocab_seg/data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (342 Bytes). View file
 
open_vocab_seg/data/__pycache__/build.cpython-39.pyc ADDED
Binary file (11.3 kB). View file
 
open_vocab_seg/data/augmentations.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import math
5
+ import numbers
6
+ import numpy as np
7
+ from detectron2.data.transforms.augmentation import Augmentation
8
+ from detectron2.data.transforms.transform import (
9
+ CropTransform,
10
+ ResizeTransform,
11
+ TransformList,
12
+ )
13
+ from PIL import Image
14
+ from fvcore.transforms.transform import PadTransform
15
+
16
+
17
def mask2box(mask: np.ndarray):
    """Return the tight bounding box (x, y, w, h) of a 2-D mask, or None if empty."""
    # Columns (x axis) that contain at least one foreground pixel.
    occupied_cols = np.nonzero(mask.sum(axis=0))[0]
    if occupied_cols.size == 0:
        return None
    # Rows (y axis) with foreground pixels.
    occupied_rows = np.nonzero(mask.sum(axis=1))[0]
    x1, x2 = occupied_cols.min(), occupied_cols.max()
    y1, y2 = occupied_rows.min(), occupied_rows.max()
    return x1, y1, x2 - x1 + 1, y2 - y1 + 1
28
+
29
+
30
def expand_box(x, y, w, h, expand_ratio=1.0, max_h=None, max_w=None):
    """Scale a box about its center by ``expand_ratio``.

    When ``max_h``/``max_w`` are given, the expanded box is clamped to
    ``[0, max-1]`` on the corresponding axis.  Returns ``[x, y, w, h]``
    truncated to ints.
    """
    center_x = x + w / 2.0
    center_y = y + h / 2.0
    half_w = w * expand_ratio / 2.0
    half_h = h * expand_ratio / 2.0
    left = center_x - half_w
    top = center_y - half_h
    right = center_x + half_w
    bottom = center_y + half_h
    if max_h is not None:
        top = max(0, top)
        bottom = min(max_h - 1, bottom)
    if max_w is not None:
        left = max(0, left)
        right = min(max_w - 1, right)
    # Convert corners back to (x, y, w, h) and truncate like the original.
    return [int(left), int(top), int(right - left), int(bottom - top)]
46
+
47
+
48
class CropImageWithMask(Augmentation):
    """Crop around the (optionally expanded) bounding box of one semantic category.

    NOTE(review): detectron2's ``Augmentation`` dispatches inputs to
    ``get_transform`` by parameter *name*, so ``image``, ``sem_seg`` and
    ``category_id`` are part of the interface and must not be renamed.
    """

    def __init__(self, expand_ratio=1.0, mode="choice"):
        # A scalar ratio becomes a degenerate (ratio, ratio) pair so both
        # sampling modes below can index it uniformly.
        if isinstance(expand_ratio, numbers.Number):
            expand_ratio = (expand_ratio, expand_ratio)
        # mode: "choice" picks one of the given ratios, "range" samples uniformly.
        self.mode = mode
        self.expand_ratio = expand_ratio
        if self.mode == "range":
            assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1]

    def get_transform(self, image, sem_seg, category_id):
        input_size = image.shape[:2]
        bin_mask = sem_seg == category_id
        # NOTE(review): mask2box returns None when the category is absent from
        # sem_seg, which would make this unpacking raise TypeError; callers
        # appear to guarantee the category is present — confirm upstream.
        x, y, w, h = mask2box(bin_mask)
        if self.mode == "choice":
            expand_ratio = np.random.choice(self.expand_ratio)
        else:
            expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1])
        # Expand about the box center and clamp to the image bounds.
        x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size)
        # Guarantee at least a 1x1 crop.
        w = max(w, 1)
        h = max(h, 1)
        return CropTransform(x, y, w, h, input_size[1], input_size[0])
69
+
70
+
71
class CropImageWithBox(Augmentation):
    """Crop around the first provided box, optionally expanded by a ratio.

    NOTE(review): ``Augmentation`` dispatches by parameter name, so the
    ``image`` and ``boxes`` parameter names of ``get_transform`` are part
    of the interface.  Boxes are expected as (x1, y1, x2, y2) with
    inclusive corners — only ``boxes[0]`` is used.
    """

    def __init__(self, expand_ratio=1.0, mode="choice"):
        # A scalar ratio becomes a degenerate (ratio, ratio) pair.
        if isinstance(expand_ratio, numbers.Number):
            expand_ratio = (expand_ratio, expand_ratio)
        # mode: "choice" picks one of the given ratios, "range" samples uniformly.
        self.mode = mode
        self.expand_ratio = expand_ratio
        if self.mode == "range":
            assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1]

    def get_transform(self, image, boxes):
        input_size = image.shape[:2]
        x, y, x2, y2 = boxes[0]
        # Inclusive corner convention: width/height include both endpoints.
        w = x2 - x + 1
        h = y2 - y + 1
        if self.mode == "choice":
            expand_ratio = np.random.choice(self.expand_ratio)
        else:
            expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1])
        # Expand about the box center and clamp to the image bounds.
        x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size)
        # Guarantee at least a 1x1 crop.
        w = max(w, 1)
        h = max(h, 1)
        return CropTransform(x, y, w, h, input_size[1], input_size[0])
93
+
94
+
95
class RandomResizedCrop(Augmentation):
    """torchvision-style RandomResizedCrop as a detectron2 Augmentation.

    Samples a crop with random area (``scale`` fraction of the image) and
    random aspect ratio (``ratio``), then resizes it to ``size``.
    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation=Image.BILINEAR,
    ):
        # A single int means a square output of (size, size).
        if isinstance(size, int):
            size = (size, size)
        else:
            assert isinstance(size, (tuple, list)) and len(size) == 2

        self.size = size

        self.scale = scale
        self.ratio = ratio
        self.interpolation = interpolation

    def get_transform(self, image):
        height, width = image.shape[:2]
        area = height * width

        # Sample aspect ratios uniformly in log space so that e.g. 3/4 and
        # 4/3 are equally likely.
        log_ratio = np.log(np.array(self.ratio))
        is_success = False
        # Rejection-sample up to 10 times for a crop that fits in the image.
        for _ in range(10):
            target_area = area * np.random.uniform(self.scale[0], self.scale[1])
            aspect_ratio = np.exp(np.random.uniform(log_ratio[0], log_ratio[1]))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if 0 < w <= width and 0 < h <= height:
                i = np.random.randint(0, width - w + 1)
                j = np.random.randint(0, height - h + 1)

                is_success = True
                break

        if not is_success:
            # Fallback to central crop with the closest valid aspect ratio.
            in_ratio = float(width) / float(height)
            if in_ratio < min(self.ratio):
                w = width
                h = int(round(w / min(self.ratio)))
            elif in_ratio > max(self.ratio):
                h = height
                w = int(round(h * max(self.ratio)))
            else:  # whole image
                w = width
                h = height
            i = (width - w) // 2
            j = (height - h) // 2
        return TransformList(
            [
                CropTransform(i, j, w, h, width, height),
                ResizeTransform(
                    h, w, self.size[1], self.size[0], interp=self.interpolation
                ),
            ]
        )
156
+
157
+
158
class CenterCrop(Augmentation):
    """Center-crop to a fixed size, padding first when the image is smaller.

    Padding fills the segmentation GT with ``seg_ignore_label`` so that the
    padded pixels do not contribute to the loss.
    """

    def __init__(self, size, seg_ignore_label):
        # Normalize `size` to a (height, width) pair.
        if isinstance(size, numbers.Number):
            size = (int(size), int(size))
        elif isinstance(size, (tuple, list)) and len(size) == 1:
            size = (size[0], size[0])
        self.size = size
        self.seg_ignore_label = seg_ignore_label

    def get_transform(self, image):

        image_height, image_width = image.shape[:2]
        crop_height, crop_width = self.size

        transforms = []
        if crop_width > image_width or crop_height > image_height:
            # Symmetric (left, top, right, bottom) padding up to the crop size;
            # the +1 on right/bottom absorbs odd differences.
            padding_ltrb = [
                (crop_width - image_width) // 2 if crop_width > image_width else 0,
                (crop_height - image_height) // 2 if crop_height > image_height else 0,
                (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
                (crop_height - image_height + 1) // 2
                if crop_height > image_height
                else 0,
            ]
            transforms.append(
                PadTransform(
                    *padding_ltrb,
                    orig_w=image_width,
                    orig_h=image_height,
                    seg_pad_value=self.seg_ignore_label
                )
            )
            # Account for the padding when computing the crop origin below.
            image_width, image_height = (
                image_width + padding_ltrb[0] + padding_ltrb[2],
                image_height + padding_ltrb[1] + padding_ltrb[3],
            )

        crop_top = int(round((image_height - crop_height) / 2.0))
        crop_left = int(round((image_width - crop_width) / 2.0))
        transforms.append(
            CropTransform(
                crop_left, crop_top, crop_width, crop_height, image_width, image_height
            )
        )
        return TransformList(transforms)
open_vocab_seg/data/build.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import itertools
5
+ import logging
6
+ import numpy as np
7
+ from collections import Counter
8
+ import torch.utils.data
9
+ from tabulate import tabulate
10
+ from termcolor import colored
11
+
12
+ from detectron2.utils.logger import _log_api_usage, log_first_n
13
+ from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
14
+ import torch.utils.data
15
+ from detectron2.config import configurable
16
+ from detectron2.data.build import (
17
+ build_batch_data_loader,
18
+ trivial_batch_collator,
19
+ load_proposals_into_dataset,
20
+ filter_images_with_only_crowd_annotations,
21
+ filter_images_with_few_keypoints,
22
+ print_instances_class_histogram,
23
+ )
24
+
25
+ from detectron2.data.common import DatasetFromList, MapDataset
26
+ from detectron2.data.dataset_mapper import DatasetMapper
27
+ from detectron2.data.detection_utils import check_metadata_consistency
28
+ from detectron2.data.samplers import (
29
+ InferenceSampler,
30
+ RandomSubsetTrainingSampler,
31
+ RepeatFactorTrainingSampler,
32
+ TrainingSampler,
33
+ )
34
+
35
+ """
36
+ This file contains the default logic to build a dataloader for training or testing.
37
+ """
38
+
39
+ __all__ = [
40
+ "build_detection_train_loader",
41
+ "build_detection_test_loader",
42
+ ]
43
+
44
+
45
def print_classification_instances_class_histogram(dataset_dicts, class_names):
    """
    Log a per-class instance-count histogram for a classification-style dataset.

    Args:
        dataset_dicts (list[dict]): list of dataset dicts, each carrying a
            single integer "category_id".
        class_names (list[str]): list of class names (zero-indexed).
    """
    num_classes = len(class_names)
    hist_bins = np.arange(num_classes + 1)
    # BUGFIX: `np.int` was removed in NumPy 1.24; the builtin `int` is the
    # documented equivalent dtype.
    histogram = np.zeros((num_classes,), dtype=int)
    for entry in dataset_dicts:
        classes = np.asarray([entry["category_id"]], dtype=int)
        if len(classes):
            assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}"
            assert (
                classes.max() < num_classes
            ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes"
        histogram += np.histogram(classes, bins=hist_bins)[0]

    N_COLS = min(6, len(class_names) * 2)

    def short_name(x):
        # make long class names shorter. useful for lvis
        if len(x) > 13:
            return x[:11] + ".."
        return x

    # Flatten into [name, count, name, count, ...] for tabulate.
    data = list(
        itertools.chain(
            *[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]
        )
    )
    total_num_instances = sum(data[1::2])
    # Pad so the table rows are complete, then append the grand total.
    data.extend([None] * (N_COLS - (len(data) % N_COLS)))
    if num_classes > 1:
        data.extend(["total", total_num_instances])
    data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)])
    table = tabulate(
        data,
        headers=["category", "#instances"] * (N_COLS // 2),
        tablefmt="pipe",
        numalign="left",
        stralign="center",
    )
    log_first_n(
        logging.INFO,
        "Distribution of instances among all {} categories:\n".format(num_classes)
        + colored(table, "cyan"),
        key="message",
    )
94
+
95
+
96
def wrap_metas(dataset_dict, **kwargs):
    """Attach ``kwargs`` to every sample under the ``"meta"`` key (mutates in place)."""
    wrapped = []
    for sample in dataset_dict:
        # Refuse to clobber a pre-existing "meta" entry on the sample.
        assert "meta" not in sample, (
            "Assigned attributes should not exist in the original sample."
        )
        sample["meta"] = kwargs
        wrapped.append(sample)
    return wrapped
105
+
106
+
107
def get_detection_dataset_dicts(
    names, filter_empty=True, min_keypoints=0, proposal_files=None
):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    Args:
        names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        min_keypoints (int): filter out images with fewer keypoints than
            `min_keypoints`. Set to 0 to do nothing.
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(names, str):
        names = [names]
    assert len(names), names
    # Unlike stock detectron2, every sample is tagged with its originating
    # dataset name under the "meta" key (see wrap_metas above).
    dataset_dicts = [
        wrap_metas(DatasetCatalog.get(dataset_name), dataset_name=dataset_name)
        for dataset_name in names
    ]
    for dataset_name, dicts in zip(names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
        ]

    # Flatten the per-dataset lists into a single list of samples.
    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)

    if has_instances:
        try:
            class_names = MetadataCatalog.get(names[0]).thing_classes
            check_metadata_consistency("thing_classes", names)
            print_instances_class_histogram(dataset_dicts, class_names)
        except AttributeError:  # class names are not available for this dataset
            pass

    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names))
    return dataset_dicts
160
+
161
+
162
def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
    """Translate a cfg into the keyword arguments of `build_detection_train_loader`.

    Used by the `@configurable` decorator below; any of `mapper`, `dataset`,
    `sampler` that are passed explicitly override the cfg-derived defaults.
    """
    if dataset is None:
        dataset = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON
            else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
            if cfg.MODEL.LOAD_PROPOSALS
            else None,
        )
        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is None:
        sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
        logger = logging.getLogger(__name__)
        logger.info("Using training sampler {}".format(sampler_name))
        if sampler_name == "TrainingSampler":
            sampler = TrainingSampler(len(dataset))
        elif sampler_name == "RepeatFactorTrainingSampler":
            # Oversample images containing rare categories.
            repeat_factors = (
                RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
                    dataset, cfg.DATALOADER.REPEAT_THRESHOLD
                )
            )
            sampler = RepeatFactorTrainingSampler(repeat_factors)
        elif sampler_name == "RandomSubsetTrainingSampler":
            sampler = RandomSubsetTrainingSampler(
                len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO
            )
        else:
            raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return {
        "dataset": dataset,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
        "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
    }
207
+
208
+
209
+ # TODO can allow dataset as an iterable or IterableDataset to make this function more general
210
@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset,
    *,
    mapper,
    sampler=None,
    total_batch_size,
    aspect_ratio_grouping=True,
    num_workers=0,
):
    """
    Build a dataloader for object detection with some default features.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
            indices to be applied on ``dataset``. Default to :class:`TrainingSampler`,
            which coordinates an infinite random shuffle sequence across all workers.
        total_batch_size (int): total batch size across all workers. Batching
            simply puts data into a list.
        aspect_ratio_grouping (bool): whether to group images with similar
            aspect ratio for efficiency. When enabled, it requires each
            element in dataset be a dict with keys "width" and "height".
        num_workers (int): number of parallel data loading workers

    Returns:
        torch.utils.data.DataLoader:
            a dataloader. Each output from it is a ``list[mapped_element]`` of length
            ``total_batch_size / num_workers``, where ``mapped_element`` is produced
            by the ``mapper``.
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        # Apply the mapper lazily, per worker, via MapDataset.
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )
261
+
262
+
263
def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Uses the given `dataset_name` argument (instead of the names in cfg), because the
    standard practice is to evaluate each test set individually (not combining them).
    """
    if isinstance(dataset_name, str):
        dataset_name = [dataset_name]

    dataset = get_detection_dataset_dicts(
        dataset_name,
        filter_empty=False,
        proposal_files=[
            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)]
            for x in dataset_name
        ]
        if cfg.MODEL.LOAD_PROPOSALS
        else None,
    )
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    # Unlike stock detectron2, the test batch size is configurable via
    # SOLVER.TEST_IMS_PER_BATCH (see build_detection_test_loader below).
    return {
        "dataset": dataset,
        "mapper": mapper,
        "num_workers": 0,
        "samples_per_gpu": cfg.SOLVER.TEST_IMS_PER_BATCH,
    }
289
+
290
+
291
@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(
    dataset, *, mapper, sampler=None, num_workers=0, samples_per_gpu=1
):
    """
    Similar to `build_detection_train_loader`, but uses a batch size of 1,
    and :class:`InferenceSampler`. This sampler coordinates all workers to
    produce the exact set of all samples.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces
            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
            which splits the dataset across all workers.
        num_workers (int): number of parallel data loading workers

    Returns:
        DataLoader: a torch DataLoader, that loads the given detection
        dataset, with test-time transformation and batching.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = InferenceSampler(len(dataset))
    # Always use 1 image per worker during inference since this is the
    # standard when reporting inference time in papers.
    # NOTE(review): samples_per_gpu (from SOLVER.TEST_IMS_PER_BATCH) can raise
    # this above 1, diverging from the comment above.
    batch_sampler = torch.utils.data.sampler.BatchSampler(
        sampler, samples_per_gpu, drop_last=False
    )
    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=num_workers,
        batch_sampler=batch_sampler,
        collate_fn=trivial_batch_collator,
    )
    return data_loader
344
+
open_vocab_seg/data/dataset_mappers/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
open_vocab_seg/data/dataset_mappers/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (288 Bytes). View file
 
open_vocab_seg/data/dataset_mappers/__pycache__/mask_former_semantic_dataset_mapper.cpython-39.pyc ADDED
Binary file (5.14 kB). View file
 
open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
3
+
4
+ import copy
5
+ import logging
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.nn import functional as F
10
+
11
+ from detectron2.config import configurable
12
+ from detectron2.data import MetadataCatalog
13
+ from detectron2.data import detection_utils as utils
14
+ from detectron2.data import transforms as T
15
+ from detectron2.projects.point_rend import ColorAugSSDTransform
16
+ from detectron2.structures import BitMasks, Instances
17
+
18
+ __all__ = ["MaskFormerSemanticDatasetMapper"]
19
+
20
+
21
class MaskFormerSemanticDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and map it into a format used by MaskFormer for semantic segmentation.

    The callable currently does the following:

    1. Read the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Find and applies suitable cropping to the image and annotation
    4. Prepare image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored in evaluation; pixels with this
                value are excluded from the per-category binary masks.
            size_divisibility: pad image size to be divisible by this value
                (<= 0 disables padding; `from_config` passes -1 at inference time)
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.ignore_label = ignore_label
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(
            f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}"
        )

    @classmethod
    def from_config(cls, cfg, is_train=True):
        """Build constructor arguments from a detectron2 CfgNode."""
        # Build augmentation: resize (+ optional crop / color aug / flip) for
        # training, deterministic resize only for inference.
        if is_train:
            augs = [
                T.ResizeShortestEdge(
                    cfg.INPUT.MIN_SIZE_TRAIN,
                    cfg.INPUT.MAX_SIZE_TRAIN,
                    cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
                )
            ]
            if cfg.INPUT.CROP.ENABLED:
                augs.append(
                    T.RandomCrop_CategoryAreaConstraint(
                        cfg.INPUT.CROP.TYPE,
                        cfg.INPUT.CROP.SIZE,
                        cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
                        cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
                    )
                )
            if cfg.INPUT.COLOR_AUG_SSD:
                augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
            augs.append(T.RandomFlip())

            # Assume always applies to the training set.
            dataset_names = cfg.DATASETS.TRAIN
        else:
            min_size = cfg.INPUT.MIN_SIZE_TEST
            max_size = cfg.INPUT.MAX_SIZE_TEST
            sample_style = "choice"
            augs = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
            dataset_names = cfg.DATASETS.TEST
        # ignore_label is taken from the first dataset's metadata; all listed
        # datasets are assumed to share it.
        meta = MetadataCatalog.get(dataset_names[0])
        ignore_label = meta.ignore_label

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg.INPUT.FORMAT,
            "ignore_label": ignore_label,
            # Padding is only applied during training; -1 disables it at test time.
            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY if is_train else -1,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
                Must contain "file_name" and "sem_seg_file_name"; must not contain
                "annotations".

        Returns:
            dict: a format that builtin models in detectron2 accept, with keys
                "image" (CHW uint8 tensor), "sem_seg" (HW long tensor) and
                "instances" (per-category binary masks).

        Raises:
            ValueError: if "sem_seg_file_name" is missing or "annotations" is present.
        """
        # assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        if "sem_seg_file_name" in dataset_dict:
            # PyTorch transformation not implemented for uint16, so converting it to double first
            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype(
                "double"
            )
        else:
            sem_seg_gt = None

        if sem_seg_gt is None:
            raise ValueError(
                "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )

        # Apply the same geometric transforms to image and label.
        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        sem_seg_gt = aug_input.sem_seg

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            # F.pad interprets negative pad amounts as cropping, so clamp to 0:
            # an image already larger than `size_divisibility` must be left
            # untouched rather than silently truncated. (Identical to the
            # original behavior whenever the crop size <= size_divisibility.)
            padding_size = [
                0,
                max(self.size_divisibility - image_size[1], 0),
                0,
                max(self.size_divisibility - image_size[0], 0),
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            if sem_seg_gt is not None:
                # Padded label pixels are marked as ignored.
                sem_seg_gt = F.pad(
                    sem_seg_gt, padding_size, value=self.ignore_label
                ).contiguous()

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        if sem_seg_gt is not None:
            dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            raise ValueError(
                "Semantic segmentation dataset should not have 'annotations'."
            )

        # Prepare per-category binary masks
        if sem_seg_gt is not None:
            sem_seg_gt = sem_seg_gt.numpy()
            instances = Instances(image_shape)
            classes = np.unique(sem_seg_gt)
            # remove ignored region
            classes = classes[classes != self.ignore_label]
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)

            masks = [sem_seg_gt == class_id for class_id in classes]

            if len(masks) == 0:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros(
                    (0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])
                )
            else:
                masks = BitMasks(
                    torch.stack(
                        [
                            torch.from_numpy(np.ascontiguousarray(x.copy()))
                            for x in masks
                        ]
                    )
                )
                instances.gt_masks = masks.tensor

            dataset_dict["instances"] = instances

        return dataset_dict
open_vocab_seg/data/datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from . import register_coco_stuff, register_voc_seg
3
+ from . import register_cc3m
4
+ from . import register_ade20k_full
5
+ from . import register_pascal_context
open_vocab_seg/data/datasets/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (386 Bytes). View file
 
open_vocab_seg/data/datasets/__pycache__/register_ade20k_full.cpython-39.pyc ADDED
Binary file (37 kB). View file
 
open_vocab_seg/data/datasets/__pycache__/register_cc3m.cpython-39.pyc ADDED
Binary file (17.7 kB). View file
 
open_vocab_seg/data/datasets/__pycache__/register_coco_stuff.cpython-39.pyc ADDED
Binary file (9.49 kB). View file
 
open_vocab_seg/data/datasets/__pycache__/register_pascal_context.cpython-39.pyc ADDED
Binary file (6.53 kB). View file
 
open_vocab_seg/data/datasets/__pycache__/register_voc_seg.cpython-39.pyc ADDED
Binary file (1.49 kB). View file