diff --git a/README.md b/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4f65397694faa1ab3a13b9ab8e5b740cfa92b5ca 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,108 @@ +
+ +## Open-YOLO 3D: Towards Fast and Accurate Open-Vocabulary 3D Instance Segmentation +
+ +
+ +
+ +
+Mohamed El Amine Boudjoghra1, Angela Dai2, Jean Lahoud1, Hisham Cholakkal1, Rao Muhammad Anwer1,3, Salman Khan1,4, Fahad Khan1,5 + +1Mohamed Bin Zayed University of Artificial Intelligence (MBZUAI) 2Technical University of Munich (TUM) 3Aalto University 4Australian National University 5Linköping University +
+ + +
+ +![paper](https://img.shields.io/badge/arXiv-Paper-.svg) + + +
+ + + +### News + +* **30 May 2024**: [Open-YOLO 3D](https://arxiv.org/abs/2406.02548) released on arXiv. 📝 +* **30 May 2024**: Code released. 💻 + +### Abstract + + Recent works on open-vocabulary 3D instance segmentation show strong promise, but at the cost of slow inference speed and high computation requirements. This high computation cost is typically due to their heavy reliance on 3D clip features, which require computationally expensive 2D foundation models like Segment Anything (SAM) and CLIP for multi-view aggregation into 3D. As a consequence, this hampers their applicability in many real-world applications that require both fast and accurate predictions. To this end, we propose a fast yet accurate open-vocabulary 3D instance segmentation approach, named Open-YOLO 3D, that effectively leverages only 2D object detection from multi-view RGB images for open-vocabulary 3D instance segmentation. + We address this task by generating class-agnostic 3D masks for objects in the scene and associating them with text prompts. + We observe that the projection of class-agnostic 3D point cloud instances already holds instance information; thus, using SAM might only result in redundancy that unnecessarily increases the inference time. +We empirically find that a better performance of matching text prompts to 3D masks can be achieved in a faster fashion with a 2D object detector. We validate our Open-YOLO 3D on two benchmarks, ScanNet200 and Replica, + under two scenarios: (i) with ground truth masks, where labels are required for given object proposals, and (ii) with class-agnostic 3D proposals generated from a 3D proposal network. Our Open-YOLO 3D achieves state-of-the-art performance on both datasets while obtaining up to 16x speedup compared to the best existing method in literature. On ScanNet200 val. set, our Open-YOLO 3D achieves mean average precision (mAP) of 24.7% while operating at 22 seconds per scene. + +### Qualitative results +
+ +
+ +
+ + +## Installation guide + +Kindly check [Installation guide](./docs/Installation.md) on how to set up the Conda environment and to download the checkpoints, the pre-computed class agnostic masks, and the ground truth masks. + +## Data Preparation + +Kindly check [Data Preparation guide](./docs/Data_prep.md) on how to prepare ScanNet200 and Replica datasets. + +## Results reproducibility + +Kindly use the pre-computed class agnostic masks we shared to reproduce the exact numbers we reported in the paper. + +**Reproduce the results of ScanNet200 with precomputed-masks (using Mask3D)** +``` +python run_evaluation.py --dataset_name scannet200 --path_to_3d_masks "./output/scannet200/scannet200_masks" +``` +**Reproduce the results of ScanNet200 with oracle 3D masks (ground truth 3D masks)** +``` +python run_evaluation.py --dataset_name scannet200 --path_to_3d_masks "./output/scannet200/scannet200_ground_truth_masks" --is_gt +``` +**Reproduce the results of Replica with precomputed-masks (using Mask3D)** +``` +python run_evaluation.py --dataset_name replica --path_to_3d_masks "./output/replica/replica_masks" +``` +**Reproduce the results of Replica with oracle 3D masks (ground truth 3D masks)** +``` +python run_evaluation.py --dataset_name replica --path_to_3d_masks "./output/replica/replica_ground_truth_masks" --is_gt +``` + +You can evaluate without our 3D class-agnostic masks, but this may lead to variability in results due to elements like furthest point sampling that cause randomness in predictions from Mask3D. For consistent results with the ones we report in the paper, we recommend using our pre-computed masks. + +**Reproduce the results of Replica or ScanNet200 without using our pre-computed masks** +``` +python run_evaluation.py --dataset_name $DATASET_NAME +``` + +## Single scene inference + +``` +from utils import OpenYolo3D + +openyolo3d = OpenYolo3D("$(pwd)/pretrained/config.yaml") #Initialize the model, define the text prompts in the config. 
+prediction = openyolo3d.predict("$(pwd)/data/replica/office0", 6553.5) #Predict the instance masks and labels (takes around 20 seconds in total). +openyolo3d.save_output_as_ply("$(pwd)/sample/output.ply", True) # Save the ply file for visualization, you can use meshlab to visualize the output scene +``` + +## Acknowledgments +We would like to thank the authors of Mask3D and YoloWorld for their works which were used for our model. + + +## BibTeX :pray: +``` +@misc{boudjoghra2024openyolo, + title={Open-YOLO 3D: Towards Fast and Accurate Open-Vocabulary 3D Instance Segmentation}, + author={Mohamed El Amine Boudjoghra and Angela Dai and Jean Lahoud and Hisham Cholakkal and Rao Muhammad Anwer and Salman Khan and Fahad Shahbaz Khan}, + year={2024}, + eprint={2406.02548}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ce639c16688b6ced1b83624da4a391a43ccf30e --- /dev/null +++ b/environment.yml @@ -0,0 +1,216 @@ +name: openyolo3d +channels: + - anaconda + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=openblas + - boltons=23.0.0=py310h06a4308_0 + - brotlipy=0.7.0=py310h7f8727e_1002 + - bzip2=1.0.8=h7b6447c_0 + - ca-certificates=2023.01.10=h06a4308_0 + - certifi=2022.12.7=py310h06a4308_0 + - cffi=1.15.1=py310h5eee18b_3 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - conda=23.3.1=py310h06a4308_0 + - conda-content-trust=0.1.3=py310h06a4308_0 + - conda-package-handling=2.0.2=py310h06a4308_0 + - conda-package-streaming=0.7.0=py310h06a4308_0 + - cryptography=39.0.1=py310h9ce1e76_0 + - idna=3.4=py310h06a4308_0 + - jsonpatch=1.32=pyhd3eb1b0_0 + - jsonpointer=2.1=pyhd3eb1b0_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.2=h6a678d5_6 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libopenblas=0.3.21=h043d6bf_0 + - 
libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - ncurses=6.4=h6a678d5_0 + - nomkl=3.0=0 + - openblas-devel=0.3.21=h06a4308_0 + - openssl=1.1.1s=h7f8727e_0 + - packaging=23.0=py310h06a4308_0 + - pluggy=1.0.0=py310h06a4308_1 + - pycosat=0.6.4=py310h5eee18b_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyopenssl=23.0.0=py310h06a4308_0 + - pysocks=1.7.1=py310h06a4308_0 + - python=3.10.9=h7a1cb2a_0 + - readline=8.2=h5eee18b_0 + - requests=2.28.1=py310h06a4308_1 + - ruamel.yaml=0.17.21=py310h5eee18b_0 + - ruamel.yaml.clib=0.2.6=py310h5eee18b_1 + - setuptools=65.6.3=py310h06a4308_0 + - six=1.16.0=pyhd3eb1b0_1 + - sqlite=3.41.2=h5eee18b_0 + - tk=8.6.12=h1ccaba5_0 + - toolz=0.12.0=py310h06a4308_0 + - tqdm=4.65.0=py310h2f386ee_0 + - urllib3=1.26.15=py310h06a4308_0 + - wheel=0.37.1=pyhd3eb1b0_0 + - xz=5.2.10=h5eee18b_1 + - zlib=1.2.13=h5eee18b_0 + - zstandard=0.19.0=py310h5eee18b_0 + - pip + - pip: + - absl-py==1.4.0 + - addict==2.4.0 + - aiohttp==3.8.4 + - aiosignal==1.3.1 + # - albumentations==1.2.1 #manual + - antlr4-python3-runtime==4.8 + - anyio==3.6.2 + - appdirs==1.4.4 + - asttokens==2.2.1 + - async-timeout==4.0.2 + - attrs==23.1.0 + - backcall==0.2.0 + - black==21.4b2 + - cachetools==5.3.0 + - click==8.1.3 + - cloudpickle==2.1.0 + - comm==0.1.3 + - configargparse==1.5.3 + - contourpy==1.0.7 + - cycler==0.11.0 + - dash==2.9.3 + - dash-core-components==2.0.0 + - dash-html-components==2.0.0 + - dash-table==5.0.0 + - debugpy==1.6.7 + - decorator==5.1.1 + # - detectron2==0.6 + - docker-pycreds==0.4.0 + - executing==1.2.0 + - fastapi==0.95.1 + - fastjsonschema==2.16.3 + - fire==0.4.0 + - flake8==6.0.0 + - flask==2.2.3 + - fonttools==4.39.3 + - frozenlist==1.3.3 + - fsspec==2023.4.0 + # - fvcore==0.1.5.post20220512 #manual + - gitdb==4.0.10 + - gitpython==3.1.31 + - google-auth==2.17.3 + - google-auth-oauthlib==1.0.0 + - grpcio==1.54.0 + - h11==0.14.0 + - hydra-core==1.0.5 + - imageio==2.21.1 + - importlib-metadata==3.10.1 + - iopath==0.1.10 + - ipykernel==6.22.0 + - 
ipython==8.12.0 + - ipywidgets==8.0.6 + - itsdangerous==2.1.2 + - jedi==0.18.2 + - jinja2==3.1.2 + - joblib==1.2.0 + - jsonschema==4.17.3 + - jupyter-client==8.2.0 + - jupyter-core==5.3.0 + - jupyterlab-widgets==3.0.7 + - kiwisolver==1.4.4 + - lazy-loader==0.2 + - loguru==0.6.0 + - markdown==3.4.3 + - markupsafe==2.1.2 + - matplotlib==3.7.1 + - matplotlib-inline==0.1.6 + # - minkowskiengine==0.5.4 + - multidict==6.0.4 + - mypy-extensions==1.0.0 + - natsort==8.3.1 + - nbformat==5.7.0 + - nest-asyncio==1.5.6 + - networkx==3.1 + - ninja==1.10.2.3 + - numpy==1.24.2 + - oauthlib==3.2.2 + # - omegaconf==2.0.6 #manual + # - open3d==0.17.0 #manual + - opencv-python-headless==4.7.0.72 + - pandas==2.0.0 + - parso==0.8.3 + - pathspec==0.11.1 + - pathtools==0.1.2 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pillow==9.5.0 + - pip==23.1 + - platformdirs==3.2.0 + - plotly==5.14.1 + - plyfile==0.7.4 + # - pointnet2==0.0.0 + - portalocker==2.7.0 + - prompt-toolkit==3.0.38 + - protobuf==4.22.3 + - psutil==5.9.5 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pyasn1==0.5.0 + - pyasn1-modules==0.3.0 + - pycocotools==2.0.4 + - pydantic==1.10.7 + - pydeprecate==0.3.2 + - pygments==2.15.1 + - pyparsing==3.0.9 + - pyquaternion==0.9.9 + - pyrsistent==0.19.3 + - python-dateutil==2.8.2 + - python-dotenv==0.20.0 + - python-multipart==0.0.6 + # - pytorch-lightning==1.7.2 + - pytz==2023.3 + - pyviz3d==0.2.28 + - pywavelets==1.4.1 + - pyyaml==5.3.1 + - pyzmq==25.0.2 + - qudida==0.0.4 + - regex==2023.3.23 + - requests-oauthlib==1.3.1 + - rsa==4.9 + - scikit-image==0.20.0 + - scikit-learn==1.1.2 + - scipy==1.9.0 + - sentry-sdk==1.20.0 + - setproctitle==1.3.2 + - smmap==5.0.0 + - sniffio==1.3.0 + - stack-data==0.6.2 + - starlette==0.26.1 + - tabulate==0.9.0 + - tenacity==8.2.2 + - tensorboard==2.12.2 + - tensorboard-data-server==0.7.0 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.2.0 + - threadpoolctl==3.1.0 + - tifffile==2023.4.12 + - toml==0.10.2 + # - torch==1.12.1+cu113 + # - 
torch-scatter==2.1.1 + # - torchmetrics==0.11.4 + # - torchvision==0.13.1+cu113 + - tornado==6.3 + - traitlets==5.9.0 + - trimesh==3.14.0 + - typing-extensions==4.5.0 + - tzdata==2023.3 + - uvicorn==0.21.1 + - volumentations==0.1.8 + - wandb==0.15.0 + - wcwidth==0.2.6 + - werkzeug==2.2.3 + - widgetsnbextension==4.0.7 + - yacs==0.1.8 + - yarl==1.8.2 + - zipp==3.15.0 +prefix: /opt/conda diff --git a/models/Mask3D/LICENSE b/models/Mask3D/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e619d905e048f45390e27e6fc2d93b6e96f1ea3b --- /dev/null +++ b/models/Mask3D/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2022 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ diff --git a/models/Mask3D/MANIFEST.in b/models/Mask3D/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..9ead0b59b546d425aeac6e46dba4278ef87eb3a7 --- /dev/null +++ b/models/Mask3D/MANIFEST.in @@ -0,0 +1 @@ +recursive-include mask3d/conf *.yaml \ No newline at end of file diff --git a/models/Mask3D/README.md b/models/Mask3D/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e02d4a639970a937a899274858682e85b2a33de8 --- /dev/null +++ b/models/Mask3D/README.md @@ -0,0 +1,289 @@ +# Packaged version of Mask3D to be used in LabelMaker + +## Installation + +``` +# Some users experienced issues on Ubuntu with an AMD CPU +# Install libopenblas-dev (issue #115, thanks WindWing) +# sudo apt-get install libopenblas-dev + +export TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6" + +conda env create -f environment.yml + +conda activate mask3d_cuda113 + +pip3 install torch==1.12.1+cu113 torchvision==0.13.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 +pip3 install torch-scatter -f https://data.pyg.org/whl/torch-1.12.1+cu113.html +pip3 install 'git+https://github.com/facebookresearch/detectron2.git@710e7795d0eeadf9def0e7ef957eea13532e34cf' --no-deps + +mkdir third_party +cd third_party + +git clone --recursive "https://github.com/NVIDIA/MinkowskiEngine" +cd MinkowskiEngine +git checkout 02fc608bea4c0549b0a7b00ca1bf15dee4a0b228 +python setup.py install --force_cuda --blas=openblas + +cd .. +git clone https://github.com/ScanNet/ScanNet.git +cd ScanNet/Segmentator +git checkout 3e5726500896748521a6ceb81271b0f5b2c0e7d2 +make + +cd third_party/pointnet2 +python setup.py install + +cd ../../ +pip3 install pytorch-lightning==1.7.2 + +pip install . + +``` + +To use the model in your code you need to download a checkpoint from the list below. 
+Afterwards, the basic model can be used like: + + +```python +from mask3d import get_model + +model = get_model(checkpoint_path='checkpoints/scannet200/scannet200_benchmark.ckpt') +``` + + +Here is a minimal example assuming you have a pointcloud in the folder data. + +```python + +from mask3d import get_model, load_mesh, prepare_data, map_output_to_pointcloud, save_colorized_mesh + +model = get_model('checkpoints/scannet200/scannet200_benchmark.ckpt') +model.eval() +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# load input data +pointcloud_file = 'data/pcl.ply' +mesh = load_mesh(pointcloud_file) + +# prepare data +data, points, colors, features, unique_map, inverse_map = prepare_data(mesh, device) + +# run model +with torch.no_grad(): + outputs = model(data, raw_coordinates=features) + +# map output to point cloud +labels = map_output_to_pointcloud(mesh, outputs, inverse_map) + +# save colorized mesh +save_colorized_mesh(mesh, labels, 'data/pcl_labelled.ply', colormap='scannet200') +``` + +So far, only Scannet200 checkpoints are supported. We are working on the ScanNet checkpoints. + +# Original Information + +## Mask3D: Mask Transformer for 3D Instance Segmentation +
+Jonas Schult1, Francis Engelmann2,3, Alexander Hermans1, Or Litany4, Siyu Tang3, Bastian Leibe1 + +1RWTH Aachen University 2ETH AI Center 3ETH Zurich 4NVIDIA + +Mask3D predicts accurate 3D semantic instances achieving state-of-the-art on ScanNet, ScanNet200, S3DIS and STPLS3D. + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask3d-for-3d-semantic-instance-segmentation/3d-instance-segmentation-on-scannetv2)](https://paperswithcode.com/sota/3d-instance-segmentation-on-scannetv2?p=mask3d-for-3d-semantic-instance-segmentation) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask3d-for-3d-semantic-instance-segmentation/3d-instance-segmentation-on-scannet200)](https://paperswithcode.com/sota/3d-instance-segmentation-on-scannet200?p=mask3d-for-3d-semantic-instance-segmentation) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask3d-for-3d-semantic-instance-segmentation/3d-instance-segmentation-on-s3dis)](https://paperswithcode.com/sota/3d-instance-segmentation-on-s3dis?p=mask3d-for-3d-semantic-instance-segmentation) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/mask3d-for-3d-semantic-instance-segmentation/3d-instance-segmentation-on-stpls3d)](https://paperswithcode.com/sota/3d-instance-segmentation-on-stpls3d?p=mask3d-for-3d-semantic-instance-segmentation) + +PyTorch +Lightning +Config: Hydra + +![teaser](./docs/teaser.jpg) + +
+

+ +[[Project Webpage](https://jonasschult.github.io/Mask3D/)] +[[Paper](https://arxiv.org/abs/2210.03105)] +[[Demo](https://francisengelmann.github.io/mask3d/)] + + +## News + +* **17. January 2023**: Mask3D is accepted at ICRA 2023. :fire: +* **14. October 2022**: STPLS3D support added. +* **10. October 2022**: Mask3D ranks 2nd on the [STPLS3D Challenge](https://codalab.lisn.upsaclay.fr/competitions/4646#results) hosted by the [Urban3D Workshop](https://urban3dchallenge.github.io/) at ECCV 2022. +* **6. October 2022**: [Mask3D preprint](https://arxiv.org/abs/2210.03105) released on arXiv. +* **25. September 2022**: Code released. + +## Code structure +We adapt the codebase of [Mix3D](https://github.com/kumuji/mix3d) which provides a highly modularized framework for 3D Semantic Segmentation based on the MinkowskiEngine. + +``` +├── mix3d +│ ├── main_instance_segmentation.py <- the main file +│ ├── conf <- hydra configuration files +│ ├── datasets +│ │ ├── preprocessing <- folder with preprocessing scripts +│ │ ├── semseg.py <- indoor dataset +│ │ └── utils.py +│ ├── models <- Mask3D modules +│ ├── trainer +│ │ ├── __init__.py +│ │ └── trainer.py <- train loop +│ └── utils +├── data +│ ├── processed <- folder for preprocessed datasets +│ └── raw <- folder for raw datasets +├── scripts <- train scripts +├── docs +├── README.md +└── saved <- folder that stores models and logs +``` + +### Dependencies :memo: +The main dependencies of the project are the following: +```yaml +python: 3.10.9 +cuda: 11.3 +``` +You can set up a conda environment as follows +``` +# Some users experienced issues on Ubuntu with an AMD CPU +# Install libopenblas-dev (issue #115, thanks WindWing) +# sudo apt-get install libopenblas-dev + +export TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6" + +conda env create -f environment.yml + +conda activate mask3d_cuda113 + +pip3 install torch==1.12.1+cu113 torchvision==0.13.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113 +pip3 
install torch-scatter -f https://data.pyg.org/whl/torch-1.12.1+cu113.html +pip3 install 'git+https://github.com/facebookresearch/detectron2.git@710e7795d0eeadf9def0e7ef957eea13532e34cf' --no-deps + +mkdir third_party +cd third_party + +git clone --recursive "https://github.com/NVIDIA/MinkowskiEngine" +cd MinkowskiEngine +git checkout 02fc608bea4c0549b0a7b00ca1bf15dee4a0b228 +python setup.py install --force_cuda --blas=openblas + +cd .. +git clone https://github.com/ScanNet/ScanNet.git +cd ScanNet/Segmentator +git checkout 3e5726500896748521a6ceb81271b0f5b2c0e7d2 +make + +cd ../../pointnet2 +python setup.py install + +cd ../../ +pip3 install pytorch-lightning==1.7.2 +``` + +### Data preprocessing :hammer: +After installing the dependencies, we preprocess the datasets. + +#### ScanNet / ScanNet200 +First, we apply Felzenszwalb and Huttenlocher's Graph Based Image Segmentation algorithm to the test scenes using the default parameters. +Please refer to the [original repository](https://github.com/ScanNet/ScanNet/tree/master/Segmentator) for details. +Put the resulting segmentations in `./data/raw/scannet_test_segments`. +``` +python -m datasets.preprocessing.scannet_preprocessing preprocess \ +--data_dir="PATH_TO_RAW_SCANNET_DATASET" \ +--save_dir="data/processed/scannet" \ +--git_repo="PATH_TO_SCANNET_GIT_REPO" \ +--scannet200=false/true +``` + +#### S3DIS +The S3DIS dataset contains some small bugs which we initially fixed manually. We will soon release a preprocessing script which directly preprocesses the original dataset. For the time being, please follow the instructions [here](https://github.com/JonasSchult/Mask3D/issues/8#issuecomment-1279535948) to fix the dataset manually. 
Afterwards, call the preprocessing script as follows: + +``` +python -m datasets.preprocessing.s3dis_preprocessing preprocess \ +--data_dir="PATH_TO_Stanford3dDataset_v1.2" \ +--save_dir="data/processed/s3dis" +``` + +#### STPLS3D +``` +python -m datasets.preprocessing.stpls3d_preprocessing preprocess \ +--data_dir="PATH_TO_STPLS3D" \ +--save_dir="data/processed/stpls3d" +``` + +### Training and testing :train2: +Train Mask3D on the ScanNet dataset: +```bash +python main_instance_segmentation.py +``` +Please refer to the [config scripts](https://github.com/JonasSchult/Mask3D/tree/main/scripts) (for example [here](https://github.com/JonasSchult/Mask3D/blob/main/scripts/scannet/scannet_val.sh#L15)) for detailed instructions how to reproduce our results. +In the simplest case the inference command looks as follows: +```bash +python main_instance_segmentation.py \ +general.checkpoint='PATH_TO_CHECKPOINT.ckpt' \ +general.train_mode=false +``` + +## Trained checkpoints :floppy_disk: +We provide detailed scores and network configurations with trained checkpoints. + +### [S3DIS](http://buildingparser.stanford.edu/dataset.html) (pretrained on ScanNet train+val) +Following PointGroup, HAIS and SoftGroup, we finetune a model pretrained on ScanNet ([config](./scripts/scannet/scannet_pretrain_for_s3dis.sh) and [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/scannet_pretrained.ckpt)). 
+| Dataset | AP | AP_50 | AP_25 | Config | Checkpoint :floppy_disk: | Scores :chart_with_upwards_trend: | Visualizations :telescope: +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| Area 1 | 69.3 | 81.9 | 87.7 | [config](scripts/s3dis/s3dis_pretrained.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area1_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area1_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_1/) +| Area 2 | 44.0 | 59.5 | 66.5 | [config](scripts/s3dis/s3dis_pretrained.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area2_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area2_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_2/) +| Area 3 | 73.4 | 83.2 | 88.2 | [config](scripts/s3dis/s3dis_pretrained.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area3_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area3_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_3/) +| Area 4 | 58.0 | 69.5 | 74.9 | [config](scripts/s3dis/s3dis_pretrained.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area4_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area4_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_4/) +| Area 5 | 57.8 | 71.9 | 77.2 | [config](scripts/s3dis/s3dis_pretrained.sh) | 
[checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area5_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area5_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_5/) +| Area 6 | 68.4 | 79.9 | 85.2 | [config](scripts/s3dis/s3dis_pretrained.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/scannet_pretrained/area6_scannet_pretrained.ckpt) | [scores](./docs/detailed_scores/s3dis/scannet_pretrained/s3dis_area6_scannet_pretrained.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/scannet_pretrained/area_6/) + +### [S3DIS](http://buildingparser.stanford.edu/dataset.html) (from scratch) + +| Dataset | AP | AP_50 | AP_25 | Config | Checkpoint :floppy_disk: | Scores :chart_with_upwards_trend: | Visualizations :telescope: +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| Area 1 | 74.1 | 85.1 | 89.6 | [config](scripts/s3dis/s3dis_from_scratch.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area1_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area1_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_1/) +| Area 2 | 44.9 | 57.1 | 67.9 | [config](scripts/s3dis/s3dis_from_scratch.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area2_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area2_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_2/) +| Area 3 | 74.4 | 84.4 | 88.1 | [config](scripts/s3dis/s3dis_from_scratch.sh) | 
[checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area3_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area3_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_3/) +| Area 4 | 63.8 | 74.7 | 81.1 | [config](scripts/s3dis/s3dis_from_scratch.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area4_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area4_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_4/) +| Area 5 | 56.6 | 68.4 | 75.2 | [config](scripts/s3dis/s3dis_from_scratch.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area5_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area5_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_5/) +| Area 6 | 73.3 | 83.4 | 87.8 | [config](scripts/s3dis/s3dis_from_scratch.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/s3dis/from_scratch/area6_from_scratch.ckpt) | [scores](./docs/detailed_scores/s3dis/from_scratch/s3dis_area6_from_scratch.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/s3dis/from_scratch/area_6/) + +### [ScanNet v2](https://kaldir.vc.in.tum.de/scannet_benchmark/semantic_instance_3d?metric=ap) + +| Dataset | AP | AP_50 | AP_25 | Config | Checkpoint :floppy_disk: | Scores :chart_with_upwards_trend: | Visualizations :telescope: +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| ScanNet val | 55.2 | 73.7 | 83.5 | [config](scripts/scannet/scannet_val.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/scannet/scannet_val.ckpt) | 
[scores](./docs/detailed_scores/scannet_val.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/scannet/val/) +| ScanNet test | 56.6 | 78.0 | 87.0 | [config](scripts/scannet/scannet_benchmark.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/scannet/scannet_benchmark.ckpt) | [scores](http://kaldir.vc.in.tum.de/scannet_benchmark/result_details?id=1081) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/scannet/test/) + +### [ScanNet 200](https://kaldir.vc.in.tum.de/scannet_benchmark/scannet200_semantic_instance_3d) + +| Dataset | AP | AP_50 | AP_25 | Config | Checkpoint :floppy_disk: | Scores :chart_with_upwards_trend: | Visualizations :telescope: +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| ScanNet200 val | 27.4 | 37.0 | 42.3 | [config](scripts/scannet200/scannet200_val.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/scannet200/scannet200_val.ckpt) | [scores](./docs/detailed_scores/scannet200_val.txt) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/scannet200/val/) +| ScanNet200 test | 27.8 | 38.8 | 44.5 | [config](scripts/scannet200/scannet200_benchmark.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/scannet200/scannet200_benchmark.ckpt) | [scores](https://kaldir.vc.in.tum.de/scannet_benchmark/result_details?id=1242) | [visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/scannet200/test/) + +### [STPLS3D](https://www.stpls3d.com/) + +| Dataset | AP | AP_50 | AP_25 | Config | Checkpoint :floppy_disk: | Scores :chart_with_upwards_trend: | Visualizations :telescope: +|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| +| STPLS3D val | 57.3 | 74.3 | 81.6 | [config](scripts/stpls3d/stpls3d_val.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/stpls3d/stpls3d_val.ckpt) | [scores](./docs/detailed_scores/stpls3d.txt) | 
[visualizations](https://omnomnom.vision.rwth-aachen.de/data/mask3d/visualizations/stpls3d/) +| STPLS3D test | 63.4 | 79.2 | 85.6 | [config](scripts/stpls3d/stpls3d_benchmark.sh) | [checkpoint](https://omnomnom.vision.rwth-aachen.de/data/mask3d/checkpoints/stpls3d/stpls3d_benchmark.zip) | [scores](https://codalab.lisn.upsaclay.fr/competitions/4646#results) | visualizations + +## BibTeX :pray: +``` +@article{Schult23ICRA, + title = {{Mask3D: Mask Transformer for 3D Semantic Instance Segmentation}}, + author = {Schult, Jonas and Engelmann, Francis and Hermans, Alexander and Litany, Or and Tang, Siyu and Leibe, Bastian}, + booktitle = {{International Conference on Robotics and Automation (ICRA)}}, + year = {2023} +} +``` diff --git a/models/Mask3D/__init__.py b/models/Mask3D/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/Mask3D/build/lib/mask3d/__init__.py b/models/Mask3D/build/lib/mask3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b01a17620598f366cfa55c36a48609b1f0075f6 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/__init__.py @@ -0,0 +1,216 @@ +import hydra +import torch + +from mask3d.models.mask3d import Mask3D +from mask3d.utils.utils import ( + load_checkpoint_with_missing_or_exsessive_keys, + load_backbone_checkpoint_with_missing_or_exsessive_keys, +) + +class InstanceSegmentation(torch.nn.Module): + def __init__(self, cfg): + super().__init__() + self.model = hydra.utils.instantiate(cfg.model) + + + def forward(self, x, raw_coordinates=None, point2segment=None): + return self.model(x, raw_coordinates=raw_coordinates, point2segment=point2segment) + + +from omegaconf import OmegaConf, DictConfig +import hydra +from hydra.core.global_hydra import GlobalHydra +from hydra.experimental import initialize, compose + +# imports for input loading +import albumentations as A +import MinkowskiEngine as ME +import numpy as np +import 
def get_model(checkpoint_path=None, dataset_name="scannet200"):
    """Build an ``InstanceSegmentation`` model from the base Hydra config.

    The base config ``config_base_instance_segmentation.yaml`` is composed
    from the ``conf`` directory, ``cfg.general.checkpoint`` is set to
    ``checkpoint_path``, and dataset-specific overrides are applied for
    ``"scannet200"`` and ``"scannet"``. Any other ``dataset_name`` leaves the
    composed config untouched. Backbone and full checkpoints are loaded when
    the corresponding config entries are not ``None``.

    Returns the constructed (and possibly checkpoint-initialised) model.
    """
    with initialize(config_path="conf"):
        cfg = compose(config_name="config_base_instance_segmentation.yaml")

    cfg.general.checkpoint = checkpoint_path

    # It would be nice to avoid hard-coding these values per dataset.
    # dataset name -> (general.num_targets, data.num_labels, data.test_mode)
    dataset_overrides = {
        "scannet200": (201, 200, "validation"),
        "scannet": (19, 20, "test"),
    }

    if dataset_name in dataset_overrides:
        num_targets, num_labels, test_mode = dataset_overrides[dataset_name]

        # general
        cfg.general.num_targets = num_targets
        cfg.general.train_mode = False
        cfg.general.eval_on_segments = True
        cfg.general.topk_per_image = 300
        cfg.general.use_dbscan = True
        cfg.general.dbscan_eps = 0.95
        cfg.general.export_threshold = 0.001

        # data
        cfg.data.num_labels = num_labels
        cfg.data.test_mode = test_mode

        # model
        cfg.model.num_queries = 150

    # TODO: cfg.model.scene_min / cfg.model.scene_max (-3. / 3.) still have to
    # be fixed and discussed with Jonas; they are intentionally left unset.

    model = InstanceSegmentation(cfg)

    if cfg.general.backbone_checkpoint is not None:
        cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys(
            cfg, model
        )
    if cfg.general.checkpoint is not None:
        cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model)

    return model
def map_output_to_pointcloud(mesh,
                             outputs,
                             inverse_map):
    """Turn raw model outputs into per-point binary masks and confidences.

    Only the first batch element of ``outputs["pred_logits"]`` and
    ``outputs["pred_masks"]`` is used. For every query the confidence is the
    maximum softmax class probability multiplied by the mean sigmoid mask
    probability over the points whose probability exceeds 0.5 (0 when no
    point does).

    Args:
        mesh: unused; kept for interface compatibility.
        outputs: mapping with "pred_logits" and "pred_masks".
            # assumes pred_logits[0] is (queries, classes) and pred_masks[0]
            # is (voxels, queries) -- TODO confirm against the model.
        inverse_map: index array mapping each original point to its row in
            the mask tensor.

    Returns:
        Tuple ``(masks, confidences)``: a bool tensor with one row per query
        and one column per entry of ``inverse_map``, and a float32 CPU tensor
        with one confidence per query. With zero queries both are empty
        (the previous per-query loop crashed in ``torch.stack``).
    """
    # parse predictions (first batch element only)
    logits = outputs["pred_logits"][0]
    masks = outputs["pred_masks"][0]

    # Only mask columns that have a matching logit row are considered.
    num_queries = len(logits)
    mask_probs = torch.sigmoid(masks[:, :num_queries])
    masks_binary = mask_probs > 0.5

    # Class confidence: highest softmax probability of each query.
    class_conf = torch.softmax(logits, dim=-1).max(dim=-1).values

    # Mask confidence: mean probability over the selected points; the epsilon
    # keeps empty masks at 0 instead of dividing by zero.
    selected_prob_sum = (mask_probs * masks_binary).sum(dim=0)
    mask_conf = selected_prob_sum / (masks_binary.sum(dim=0) + 1e-8)

    confidences = (class_conf * mask_conf).detach().to(
        device="cpu", dtype=torch.float32
    )

    # Map the masks back to the original point cloud: (points, queries) ->
    # (queries, points).
    masks_mapped = masks_binary[inverse_map].transpose(0, 1).contiguous()
    return (masks_mapped, confidences)
# Demo entry point: load a checkpoint, segment one point cloud and write a
# colourised copy of the mesh next to the input file.
if __name__ == '__main__':

    # Build the ScanNet200 model (get_model's default dataset_name) and move
    # it to the GPU when one is available.
    model = get_model('checkpoints/scannet200/scannet200_benchmark.ckpt')
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # load input data
    pointcloud_file = 'data/pcl.ply'
    mesh = load_mesh(pointcloud_file)

    # prepare data
    data, points, colors, features, unique_map, inverse_map = prepare_data(mesh, device)

    # run model
    # NOTE(review): `features` holds the normalised colours returned by
    # prepare_data, yet it is passed as `raw_coordinates` -- confirm this is
    # intended.
    with torch.no_grad():
        outputs = model(data, raw_coordinates=features)

    # map output to point cloud
    # NOTE(review): map_output_to_pointcloud returns a (masks, confidences)
    # tuple, while save_colorized_mesh indexes its argument like a per-point
    # label array (`labels_mapped == li`, `[:, 0]`). This looks incompatible;
    # verify before relying on this script.
    labels = map_output_to_pointcloud(mesh, outputs, inverse_map)

    # save colorized mesh
    save_colorized_mesh(mesh, labels, 'data/pcl_labelled.ply', colormap='scannet200')
mask over verts as .txt] [(pred1) label id] [(pred1) confidence] +# [(pred2) rel. path to pred. mask over verts as .txt] [(pred2) label id] [(pred2) confidence] +# ... +# +# NOTE: The prediction files must live in the root of the given prediction path. +# Predicted mask .txt files must live in a subfolder. +# Additionally, filenames must not contain spaces. +# The relative paths to predicted masks must contain one integer per line, +# where each line corresponds to vertices in the *_vh_clean_2.ply (in that order). +# Non-zero integers indicate part of the predicted instance. +# The label ids specify the class of the corresponding mask. +# Confidence is a float confidence score of the mask. +# +# Note that only the valid classes are used for evaluation, +# i.e., any ground truth label not in the valid label set +# is ignored in the evaluation. +# +# example usage: evaluate_semantic_instance.py --scan_path [path to scan data] --output_file [output file] + +# python imports +import math +import os, sys, argparse +import inspect +from copy import deepcopy +from uuid import uuid4 + +import torch + +try: + import numpy as np +except: + print("Failed to import numpy package.") + sys.exit(-1) + +from scipy import stats + +# currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# parentdir = os.path.dirname(currentdir) +# sys.path.insert(0,parentdir) +import benchmark.util as util +import benchmark.util_3d as util_3d + +# parser = argparse.ArgumentParser() +# parser.add_argument('--gt_path', default='', help='path to directory of gt .txt files') +# parser.add_argument('--output_file', default='', help='output file [default: ./semantic_instance_evaluation.txt]') +# opt = parser.parse_args() + +# if opt.output_file == '': +# opt.output_file = os.path.join(os.getcwd(), 'semantic_instance_evaluation.txt') + + +# ---------- Label info ---------- # +CLASS_LABELS = [ + "cabinet", + "bed", + "chair", + "sofa", + "table", + "door", + "window", + 
"bookshelf", + "picture", + "counter", + "desk", + "curtain", + "refrigerator", + "shower curtain", + "toilet", + "sink", + "bathtub", + "otherfurniture", +] +VALID_CLASS_IDS = np.array( + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39] +) +ID_TO_LABEL = {} +LABEL_TO_ID = {} +for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] +# ---------- Evaluation params ---------- # +# overlaps for evaluation +opt = {} +opt["overlaps"] = np.append(np.arange(0.5, 0.95, 0.05), 0.25) +# minimum region size for evaluation [verts] +opt["min_region_sizes"] = np.array([100]) # 100 for s3dis, scannet +# distance thresholds [m] +opt["distance_threshes"] = np.array([float("inf")]) +# distance confidences +opt["distance_confs"] = np.array([-float("inf")]) + + +def evaluate_matches(matches): + overlaps = opt["overlaps"] + min_region_sizes = [opt["min_region_sizes"][0]] + dist_threshes = [opt["distance_threshes"][0]] + dist_confs = [opt["distance_confs"][0]] + + # results: class x overlap + ap = np.zeros( + (len(dist_threshes), len(CLASS_LABELS), len(overlaps)), float + ) + for di, (min_region_size, distance_thresh, distance_conf) in enumerate( + zip(min_region_sizes, dist_threshes, dist_confs) + ): + for oi, overlap_th in enumerate(overlaps): + pred_visited = {} + for m in matches: + for p in matches[m]["pred"]: + for label_name in CLASS_LABELS: + for p in matches[m]["pred"][label_name]: + if "uuid" in p: + pred_visited[p["uuid"]] = False + for li, label_name in enumerate(CLASS_LABELS): + y_true = np.empty(0) + y_score = np.empty(0) + hard_false_negatives = 0 + has_gt = False + has_pred = False + for m in matches: + pred_instances = matches[m]["pred"][label_name] + gt_instances = matches[m]["gt"][label_name] + # filter groups in ground truth + gt_instances = [ + gt + for gt in gt_instances + if gt["instance_id"] >= 1000 + and gt["vert_count"] >= min_region_size + and gt["med_dist"] 
<= distance_thresh + and gt["dist_conf"] >= distance_conf + ] + if gt_instances: + has_gt = True + if pred_instances: + has_pred = True + + cur_true = np.ones(len(gt_instances)) + cur_score = np.ones(len(gt_instances)) * (-float("inf")) + cur_match = np.zeros(len(gt_instances), dtype=bool) + # collect matches + for (gti, gt) in enumerate(gt_instances): + found_match = False + num_pred = len(gt["matched_pred"]) + for pred in gt["matched_pred"]: + # greedy assignments + if pred_visited[pred["uuid"]]: + continue + overlap = float(pred["intersection"]) / ( + gt["vert_count"] + + pred["vert_count"] + - pred["intersection"] + ) + if overlap > overlap_th: + confidence = pred["confidence"] + # if already have a prediction for this gt, + # the prediction with the lower score is automatically a false positive + if cur_match[gti]: + max_score = max(cur_score[gti], confidence) + min_score = min(cur_score[gti], confidence) + cur_score[gti] = max_score + # append false positive + cur_true = np.append(cur_true, 0) + cur_score = np.append(cur_score, min_score) + cur_match = np.append(cur_match, True) + # otherwise set score + else: + found_match = True + cur_match[gti] = True + cur_score[gti] = confidence + pred_visited[pred["uuid"]] = True + if not found_match: + hard_false_negatives += 1 + # remove non-matched ground truth instances + cur_true = cur_true[cur_match == True] + cur_score = cur_score[cur_match == True] + + # collect non-matched predictions as false positive + for pred in pred_instances: + found_gt = False + for gt in pred["matched_gt"]: + overlap = float(gt["intersection"]) / ( + gt["vert_count"] + + pred["vert_count"] + - gt["intersection"] + ) + if overlap > overlap_th: + found_gt = True + break + if not found_gt: + num_ignore = pred["void_intersection"] + for gt in pred["matched_gt"]: + # group? 
+ if gt["instance_id"] < 1000: + num_ignore += gt["intersection"] + # small ground truth instances + if ( + gt["vert_count"] < min_region_size + or gt["med_dist"] > distance_thresh + or gt["dist_conf"] < distance_conf + ): + num_ignore += gt["intersection"] + proportion_ignore = ( + float(num_ignore) / pred["vert_count"] + ) + # if not ignored append false positive + if proportion_ignore <= overlap_th: + cur_true = np.append(cur_true, 0) + confidence = pred["confidence"] + cur_score = np.append(cur_score, confidence) + + # append to overall results + y_true = np.append(y_true, cur_true) + y_score = np.append(y_score, cur_score) + + # compute average precision + if has_gt and has_pred: + # compute precision recall curve first + + # sorting and cumsum + score_arg_sort = np.argsort(y_score) + y_score_sorted = y_score[score_arg_sort] + y_true_sorted = y_true[score_arg_sort] + y_true_sorted_cumsum = np.cumsum(y_true_sorted) + + # unique thresholds + (thresholds, unique_indices) = np.unique( + y_score_sorted, return_index=True + ) + num_prec_recall = len(unique_indices) + 1 + + # prepare precision recall + num_examples = len(y_score_sorted) + # https://github.com/ScanNet/ScanNet/pull/26 + # all predictions are non-matched but also all of them are ignored and not counted as FP + # y_true_sorted_cumsum is empty + # num_true_examples = y_true_sorted_cumsum[-1] + num_true_examples = ( + y_true_sorted_cumsum[-1] + if len(y_true_sorted_cumsum) > 0 + else 0 + ) + precision = np.zeros(num_prec_recall) + recall = np.zeros(num_prec_recall) + + # deal with the first point + y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0) + # deal with remaining + for idx_res, idx_scores in enumerate(unique_indices): + cumsum = y_true_sorted_cumsum[idx_scores - 1] + tp = num_true_examples - cumsum + fp = num_examples - idx_scores - tp + fn = cumsum + hard_false_negatives + p = float(tp) / (tp + fp) + r = float(tp) / (tp + fn) + precision[idx_res] = p + recall[idx_res] = r + + # first point 
def make_pred_info(pred: dict):
    """Split a batched prediction dict into one record per predicted instance.

    ``pred`` must provide "pred_classes", "pred_scores" and "pred_masks";
    the number of classes, the number of scores and the second dimension of
    the masks have to agree (e.g. 100, 100 and N x 100).

    Returns a dict keyed by a fresh ``uuid4`` per instance (used later to
    identify the prediction) whose values hold "label_id", "conf" and the
    instance's mask column under "mask".
    """
    assert (
        pred["pred_classes"].shape[0]
        == pred["pred_scores"].shape[0]
        == pred["pred_masks"].shape[1]
    )
    class_ids = pred["pred_classes"]
    num_instances = len(class_ids)
    return {
        uuid4(): {
            "label_id": class_ids[inst],
            "conf": pred["pred_scores"][inst],
            "mask": pred["pred_masks"][:, inst],
        }
        for inst in range(num_instances)
    }
+def assign_instances_for_scan(pred: dict, gt_file: str): + pred_info = make_pred_info(pred) + try: + gt_ids = util_3d.load_ids(gt_file) + except Exception as e: + util.print_error("unable to load " + gt_file + ": " + str(e)) + + # get gt instances + gt_instances = util_3d.get_instances( + gt_ids, VALID_CLASS_IDS, CLASS_LABELS, ID_TO_LABEL + ) + # associate + gt2pred = deepcopy(gt_instances) + for label in gt2pred: + for gt in gt2pred[label]: + gt["matched_pred"] = [] + pred2gt = {} + for label in CLASS_LABELS: + pred2gt[label] = [] + num_pred_instances = 0 + # mask of void labels in the groundtruth + bool_void = np.logical_not(np.in1d(gt_ids // 1000, VALID_CLASS_IDS)) + # go thru all prediction masks + for uuid in pred_info: + label_id = int(pred_info[uuid]["label_id"]) + conf = pred_info[uuid]["conf"] + if not label_id in ID_TO_LABEL: + continue + label_name = ID_TO_LABEL[label_id] + # read the mask + pred_mask = pred_info[uuid]["mask"] + assert len(pred_mask) == len(gt_ids) + # convert to binary + pred_mask = np.not_equal(pred_mask, 0) + num = np.count_nonzero(pred_mask) + if num < opt["min_region_sizes"][0]: + continue # skip if empty + + pred_instance = {} + pred_instance["uuid"] = uuid + pred_instance["pred_id"] = num_pred_instances + pred_instance["label_id"] = label_id + pred_instance["vert_count"] = num + pred_instance["confidence"] = conf + pred_instance["void_intersection"] = np.count_nonzero( + np.logical_and(bool_void, pred_mask) + ) + + # matched gt instances + matched_gt = [] + # go thru all gt instances with matching label + for (gt_num, gt_inst) in enumerate(gt2pred[label_name]): + intersection = np.count_nonzero( + np.logical_and(gt_ids == gt_inst["instance_id"], pred_mask) + ) + if intersection > 0: + gt_copy = gt_inst.copy() + pred_copy = pred_instance.copy() + gt_copy["intersection"] = intersection + pred_copy["intersection"] = intersection + matched_gt.append(gt_copy) + gt2pred[label_name][gt_num]["matched_pred"].append(pred_copy) + 
pred_instance["matched_gt"] = matched_gt + num_pred_instances += 1 + pred2gt[label_name].append(pred_instance) + + return gt2pred, pred2gt + + +def print_results(avgs): + sep = "" + col1 = ":" + lineLen = 64 + + print("") + print("#" * lineLen) + line = "" + line += "{:<15}".format("what") + sep + col1 + line += "{:>15}".format("AP") + sep + line += "{:>15}".format("AP_50%") + sep + line += "{:>15}".format("AP_25%") + sep + print(line) + print("#" * lineLen) + + for (li, label_name) in enumerate(CLASS_LABELS): + ap_avg = avgs["classes"][label_name]["ap"] + ap_50o = avgs["classes"][label_name]["ap50%"] + ap_25o = avgs["classes"][label_name]["ap25%"] + line = "{:<15}".format(label_name) + sep + col1 + line += sep + "{:>15.3f}".format(ap_avg) + sep + line += sep + "{:>15.3f}".format(ap_50o) + sep + line += sep + "{:>15.3f}".format(ap_25o) + sep + print(line) + + all_ap_avg = avgs["all_ap"] + all_ap_50o = avgs["all_ap_50%"] + all_ap_25o = avgs["all_ap_25%"] + + print("-" * lineLen) + line = "{:<15}".format("average") + sep + col1 + line += "{:>15.3f}".format(all_ap_avg) + sep + line += "{:>15.3f}".format(all_ap_50o) + sep + line += "{:>15.3f}".format(all_ap_25o) + sep + print(line) + print("") + + +def write_result_file(avgs, filename): + _SPLITTER = "," + with open(filename, "w") as f: + f.write( + _SPLITTER.join(["class", "class id", "ap", "ap50", "ap25"]) + "\n" + ) + for i in range(len(VALID_CLASS_IDS)): + class_name = CLASS_LABELS[i] + class_id = VALID_CLASS_IDS[i] + ap = avgs["classes"][class_name]["ap"] + ap50 = avgs["classes"][class_name]["ap50%"] + ap25 = avgs["classes"][class_name]["ap25%"] + f.write( + _SPLITTER.join( + [str(x) for x in [class_name, class_id, ap, ap50, ap25]] + ) + + "\n" + ) + + +def evaluate( + preds: dict, gt_path: str, output_file: str, dataset: str = "scannet" +): + global CLASS_LABELS + global VALID_CLASS_IDS + global ID_TO_LABEL + global LABEL_TO_ID + global opt + + if dataset == "stpls3d": + # global CLASS_LABELS + # global 
VALID_CLASS_IDS + # global ID_TO_LABEL + # global LABEL_TO_ID + + opt["min_region_sizes"] = np.array([10]) + + CLASS_LABELS = [ + "Build", + "LowVeg", + "MediumVeg", + "HighVeg", + "Vehicle", + "Truck", + "Aircraft", + "MilitaryVeh", + "Bike", + "Motorcycle", + "LightPole", + "StreetSign", + "Clutter", + "Fence", + ] + VALID_CLASS_IDS = np.array( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] + ) + + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + if dataset == "s3dis": + # global CLASS_LABELS + # global VALID_CLASS_IDS + # global ID_TO_LABEL + # global LABEL_TO_ID + + CLASS_LABELS = [ + "ceiling", + "floor", + "wall", + "beam", + "column", + "window", + "door", + "table", + "chair", + "sofa", + "bookcase", + "board", + "clutter", + ] + VALID_CLASS_IDS = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + if dataset == "scannet200": + CLASS_LABELS = ( + "chair", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + 
"basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", + "light", + "laundry basket", + "pipe", + "clothes dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "bicycle", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "candle", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "guitar case", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "cd case", + "closet rod", + "coffee kettle", + "structure", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + 
"storage organizer", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "luggage", + "mattress", + ) + + VALID_CLASS_IDS = np.array( + ( + 2, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, + 121, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 221, + 229, + 230, + 232, + 233, + 242, + 250, + 261, + 264, + 276, + 283, + 286, + 300, + 304, + 312, + 323, + 325, + 331, + 342, + 356, + 370, + 392, + 395, + 399, + 408, + 417, + 488, + 540, + 562, + 570, + 572, + 581, + 609, + 748, + 776, + 1156, + 1163, + 1164, + 1165, + 1166, + 1167, + 1168, + 1169, + 1170, + 1171, + 1172, + 1173, + 1174, + 1175, + 1176, + 1178, + 1179, + 1180, + 1181, + 1182, + 1183, + 1184, + 1185, + 1186, + 1187, + 1188, + 1189, + 1190, + 1191, + ) + ) + + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + total_true = 0 + total_seen = 0 + NUM_CLASSES = len(VALID_CLASS_IDS) + + true_positive_classes = np.zeros(NUM_CLASSES) + positive_classes = np.zeros(NUM_CLASSES) + gt_classes = np.zeros(NUM_CLASSES) + + # precision & recall + total_gt_ins = np.zeros(NUM_CLASSES) + at = 0.5 + tpsins = [[] for _ 
in range(NUM_CLASSES)] + fpsins = [[] for _ in range(NUM_CLASSES)] + # mucov and mwcov + all_mean_cov = [[] for _ in range(NUM_CLASSES)] + all_mean_weighted_cov = [[] for _ in range(NUM_CLASSES)] + + print("evaluating", len(preds), "scans...") + matches = {} + for i, (k, v) in enumerate(preds.items()): + gt_file = os.path.join(gt_path, k + ".txt") + if not os.path.isfile(gt_file): + util.print_error( + "Scan {} does not match any gt file".format(k), user_fault=True + ) + + if dataset == "s3dis": + gt_ids = util_3d.load_ids(gt_file) + gt_sem = (gt_ids // 1000) - 1 + gt_ins = gt_ids - (gt_ids // 1000) * 1000 + + # pred_sem = v['pred_classes'] - 1 + pred_sem = np.zeros(v["pred_masks"].shape[0], dtype=np.int) + # TODO CONTINUE HERE!!!!!!!!!!!!! + pred_ins = np.zeros(v["pred_masks"].shape[0], dtype=np.int) + + for inst_id in reversed(range(v["pred_masks"].shape[1])): + point_ids = np.argwhere(v["pred_masks"][:, inst_id] == 1.0)[ + :, 0 + ] + pred_ins[point_ids] = inst_id + 1 + pred_sem[point_ids] = v["pred_classes"][inst_id] - 1 + + # semantic acc + total_true += np.sum(pred_sem == gt_sem) + total_seen += pred_sem.shape[0] + + # TODO PARALLELIZ THIS!!!!!!! 
+ # pn semantic mIoU + """ + for j in range(gt_sem.shape[0]): + gt_l = int(gt_sem[j]) + pred_l = int(pred_sem[j]) + gt_classes[gt_l] += 1 + positive_classes[pred_l] += 1 + true_positive_classes[gt_l] += int(gt_l == pred_l) + """ + + uniq, counts = np.unique(pred_sem, return_counts=True) + positive_classes[uniq] += counts + + uniq, counts = np.unique(gt_sem, return_counts=True) + gt_classes[uniq] += counts + + uniq, counts = np.unique( + gt_sem[pred_sem == gt_sem], return_counts=True + ) + true_positive_classes[uniq] += counts + + # instance + un = np.unique(pred_ins) + pts_in_pred = [[] for _ in range(NUM_CLASSES)] + for ig, g in enumerate(un): # each object in prediction + if g == -1: + continue + tmp = pred_ins == g + sem_seg_i = int(stats.mode(pred_sem[tmp])[0]) + pts_in_pred[sem_seg_i] += [tmp] + + un = np.unique(gt_ins) + pts_in_gt = [[] for _ in range(NUM_CLASSES)] + for ig, g in enumerate(un): + tmp = gt_ins == g + sem_seg_i = int(stats.mode(gt_sem[tmp])[0]) + pts_in_gt[sem_seg_i] += [tmp] + + # instance mucov & mwcov + for i_sem in range(NUM_CLASSES): + sum_cov = 0 + mean_cov = 0 + mean_weighted_cov = 0 + num_gt_point = 0 + for ig, ins_gt in enumerate(pts_in_gt[i_sem]): + ovmax = 0.0 + num_ins_gt_point = np.sum(ins_gt) + num_gt_point += num_ins_gt_point + for ip, ins_pred in enumerate(pts_in_pred[i_sem]): + union = ins_pred | ins_gt + intersect = ins_pred & ins_gt + iou = float(np.sum(intersect)) / np.sum(union) + + if iou > ovmax: + ovmax = iou + ipmax = ip + + sum_cov += ovmax + mean_weighted_cov += ovmax * num_ins_gt_point + + if len(pts_in_gt[i_sem]) != 0: + mean_cov = sum_cov / len(pts_in_gt[i_sem]) + all_mean_cov[i_sem].append(mean_cov) + + mean_weighted_cov /= num_gt_point + all_mean_weighted_cov[i_sem].append(mean_weighted_cov) + + if dataset == "s3dis": + # instance precision & recall + for i_sem in range(NUM_CLASSES): + tp = [0.0] * len(pts_in_pred[i_sem]) + fp = [0.0] * len(pts_in_pred[i_sem]) + gtflag = np.zeros(len(pts_in_gt[i_sem])) + 
total_gt_ins[i_sem] += len(pts_in_gt[i_sem]) + + for ip, ins_pred in enumerate(pts_in_pred[i_sem]): + ovmax = -1.0 + + for ig, ins_gt in enumerate(pts_in_gt[i_sem]): + union = ins_pred | ins_gt + intersect = ins_pred & ins_gt + iou = float(np.sum(intersect)) / np.sum(union) + + if iou > ovmax: + ovmax = iou + igmax = ig + + if ovmax >= at: + tp[ip] = 1 # true + else: + fp[ip] = 1 # false positive + + tpsins[i_sem] += tp + fpsins[i_sem] += fp + + matches_key = os.path.abspath(gt_file) + # assign gt to predictions + gt2pred, pred2gt = assign_instances_for_scan(v, gt_file) + matches[matches_key] = {} + matches[matches_key]["gt"] = gt2pred + matches[matches_key]["pred"] = pred2gt + sys.stdout.write("\rscans processed: {}".format(i + 1)) + sys.stdout.flush() + print("") + ap_scores = evaluate_matches(matches) + avgs = compute_averages(ap_scores) + + # print + print_results(avgs) + write_result_file(avgs, output_file) + + if dataset == "s3dis": + MUCov = np.zeros(NUM_CLASSES) + MWCov = np.zeros(NUM_CLASSES) + for i_sem in range(NUM_CLASSES): + MUCov[i_sem] = np.mean(all_mean_cov[i_sem]) + MWCov[i_sem] = np.mean(all_mean_weighted_cov[i_sem]) + + precision = np.zeros(NUM_CLASSES) + recall = np.zeros(NUM_CLASSES) + for i_sem in range(NUM_CLASSES): + tp = np.asarray(tpsins[i_sem]).astype(np.float) + fp = np.asarray(fpsins[i_sem]).astype(np.float) + tp = np.sum(tp) + fp = np.sum(fp) + rec = tp / total_gt_ins[i_sem] + prec = tp / (tp + fp) + + precision[i_sem] = prec + recall[i_sem] = rec + + """ + LOG_FOUT = open(os.path.join('results_a5.txt'), 'w') + + def log_string(out_str): + LOG_FOUT.write(out_str + '\n') + LOG_FOUT.flush() + print(out_str) + """ + + return np.mean(precision), np.mean(recall) + + +# TODO: remove this +# import pandas as pd +# def main(): +# print("!!! CLI is only for debugging purposes. 
"""Shared helpers for the benchmark scripts.

Contains error reporting, readers for the tab-separated label / scene-type
mapping files, and helpers that render label or instance id images as color
images using the nyu40 palette.
"""
import csv
import os
import sys

try:
    import numpy as np
except ImportError:
    print("Failed to import numpy package.")
    sys.exit(-1)

try:
    import imageio
except ImportError:
    # imageio is only needed by the visualize_* helpers. Terminating the whole
    # process at import time made the unrelated helpers (error reporting,
    # mapping readers) unusable, so the failure is deferred to first use.
    imageio = None


def _require_imageio():
    """Return the imageio module or raise ImportError with an install hint."""
    if imageio is None:
        raise ImportError(
            "Please install the module 'imageio' for image processing, e.g.\n"
            "pip install imageio"
        )
    return imageio


def print_error(message, user_fault=False):
    """Write ``ERROR: <message>`` to stderr and terminate the process.

    Args:
        message: Anything convertible with ``str()``.
        user_fault: When true the exit status is 2, otherwise -1.

    Raises:
        SystemExit: always.
    """
    sys.stderr.write("ERROR: " + str(message) + "\n")
    sys.exit(2 if user_fault else -1)


def represents_int(s):
    """Return True if ``int(s)`` succeeds, False otherwise."""
    try:
        int(s)
    except (TypeError, ValueError):
        # TypeError covers values such as None (e.g. a missing CSV field).
        return False
    return True


def read_label_mapping(
    filename, label_from="raw_category", label_to="nyu40id"
):
    """Read a tab-separated file with a header row into a label mapping.

    Args:
        filename: Path to the tab-separated file.
        label_from: Name of the column used for the keys.
        label_to: Name of the column used for the values (parsed as int).

    Returns:
        dict mapping each ``label_from`` value to ``int(label_to value)``.
        The keys are converted to int when every key represents an int;
        otherwise they stay strings. A file with only a header row yields
        an empty dict.
    """
    assert os.path.isfile(filename), filename
    mapping = {}
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter="\t")
        for row in reader:
            mapping[row[label_from]] = int(row[label_to])
    # Convert the keys only when the mapping is non-empty and *all* keys are
    # ints; checking just the first key crashed on empty and on mixed files.
    if mapping and all(represents_int(key) for key in mapping):
        mapping = {int(k): v for k, v in mapping.items()}
    return mapping


# input: scene_types.txt or scene_types_all.txt
def read_scene_types_mapping(filename, remove_spaces=True):
    """Read ``<int id>\\t<name>`` lines into a ``{name: id}`` dict.

    Args:
        filename: Path to the mapping file.
        remove_spaces: When true, leading/trailing whitespace is stripped
            from each name.

    Returns:
        dict mapping the scene type name to its integer id. Blank lines are
        skipped.
    """
    assert os.path.isfile(filename), filename
    mapping = {}
    with open(filename) as f:
        for line in f.read().splitlines():
            if not line.strip():
                continue
            fields = line.split("\t")
            name = fields[1].strip() if remove_spaces else fields[1]
            mapping[name] = int(fields[0])
    return mapping


# color by label
def visualize_label_image(filename, image):
    """Write ``image`` to ``filename`` with each label id drawn in its color.

    Pixels equal to palette index ``i`` get ``create_color_palette()[i]``;
    all other pixels stay black.

    Raises:
        ImportError: if imageio is not installed.
    """
    writer = _require_imageio()
    height, width = image.shape[0], image.shape[1]
    vis_image = np.zeros([height, width, 3], dtype=np.uint8)
    for idx, color in enumerate(create_color_palette()):
        vis_image[image == idx] = color
    writer.imwrite(filename, vis_image)


# color by different instances (mod length of color palette)
def visualize_instance_image(filename, image):
    """Write ``image`` to ``filename`` with one palette color per instance id.

    The color of instance ``i`` is ``palette[i % len(palette)]``.

    Raises:
        ImportError: if imageio is not installed.
    """
    writer = _require_imageio()
    height, width = image.shape[0], image.shape[1]
    vis_image = np.zeros([height, width, 3], dtype=np.uint8)
    palette = create_color_palette()
    for inst in np.unique(image):
        vis_image[image == inst] = palette[inst % len(palette)]
    writer.imwrite(filename, vis_image)


# color palette for nyu40 labels
def create_color_palette():
    """Return the list of 41 RGB tuples indexed by nyu40 label id."""
    return [
        (0, 0, 0),
        (174, 199, 232),  # wall
        (152, 223, 138),  # floor
        (31, 119, 180),  # cabinet
        (255, 187, 120),  # bed
        (188, 189, 34),  # chair
        (140, 86, 75),  # sofa
        (255, 152, 150),  # table
        (214, 39, 40),  # door
        (197, 176, 213),  # window
        (148, 103, 189),  # bookshelf
        (196, 156, 148),  # picture
        (23, 190, 207),  # counter
        (178, 76, 76),
        (247, 182, 210),  # desk
        (66, 188, 102),
        (219, 219, 141),  # curtain
        (140, 57, 197),
        (202, 185, 52),
        (51, 176, 203),
        (200, 54, 131),
        (92, 193, 61),
        (78, 71, 183),
        (172, 114, 82),
        (255, 127, 14),  # refrigerator
        (91, 163, 138),
        (153, 98, 156),
        (140, 153, 101),
        (158, 218, 229),  # shower curtain
        (100, 125, 154),
        (178, 127, 135),
        (120, 185, 128),
        (146, 111, 194),
        (44, 160, 44),  # toilet
        (112, 128, 144),  # sink
        (96, 207, 209),
        (227, 119, 194),  # bathtub
        (213, 92, 176),
        (94, 106, 211),
        (82, 84, 163),  # otherfurn
        (100, 85, 144),
    ]
"""3D helpers for the benchmark scripts.

Contains a homogeneous point transform, readers/writers for per-vertex id
files and PLY meshes, export of instance predictions in the evaluation
layout, and bookkeeping of instances encoded as ``label_id * 1000 + n``.
"""
import json
import os
import sys

try:
    import numpy as np
except ImportError:
    print("Failed to import numpy package.")
    sys.exit(-1)

try:
    from plyfile import PlyData, PlyElement
except ImportError:
    # plyfile is only needed by read_mesh_vertices; defer the failure to that
    # call instead of terminating the process when this module is imported.
    PlyData = PlyElement = None

import benchmark.util as util


# matrix: 4x4 np array
# points Nx3 np array
def transform_points(matrix, points):
    """Apply a 4x4 homogeneous transform to an Nx3 array of points.

    The points are extended with a 1 as fourth coordinate, multiplied by
    ``matrix`` and divided by the resulting fourth coordinate.

    Returns:
        Nx3 array of transformed points.
    """
    assert len(points.shape) == 2 and points.shape[1] == 3
    num_points = points.shape[0]
    p = np.concatenate([points, np.ones((num_points, 1))], axis=1)
    p = np.transpose(np.matmul(matrix, np.transpose(p)))
    p[:, :3] /= p[:, 3, None]
    return p[:, :3]


def export_ids(filename, ids):
    """Write ``ids`` to ``filename`` as one integer per line."""
    with open(filename, "w") as f:
        f.writelines("%d\n" % value for value in ids)


def load_ids(filename):
    """Read a file with one integer per line into an int64 numpy array."""
    with open(filename) as f:
        lines = f.read().splitlines()
    return np.array(lines, dtype=np.int64)


def read_mesh_vertices(filename):
    """Read the x/y/z vertex coordinates of a PLY file.

    Returns:
        float32 array of shape (num_vertices, 3).

    Raises:
        ImportError: if plyfile is not installed.
    """
    assert os.path.isfile(filename)
    if PlyData is None:
        raise ImportError(
            "Please install the module 'plyfile' for PLY i/o, e.g.\n"
            "pip install plyfile"
        )
    with open(filename, "rb") as f:
        plydata = PlyData.read(f)
        vertex = plydata["vertex"]
        num_verts = vertex.count
        vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32)
        vertices[:, 0] = vertex.data["x"]
        vertices[:, 1] = vertex.data["y"]
        vertices[:, 2] = vertex.data["z"]
    return vertices


# export 3d instance labels for instance evaluation
def export_instance_ids_for_eval(filename, label_ids, instance_ids):
    """Write an instance prediction file plus one binary mask file per instance.

    ``filename`` receives one line ``<relative mask path> <label id> 1.000000``
    per instance id other than 0 (0 means "no instance for this vertex").
    The label of an instance is the label of its first vertex. Each mask file
    is written below ``<dirname(filename)>/pred_mask`` and holds a 0/1 flag
    per vertex.

    Args:
        filename: Path of the prediction text file to create.
        label_ids: Per-vertex label ids (numpy array).
        instance_ids: Per-vertex instance ids (numpy array, same length).
    """
    assert label_ids.shape[0] == instance_ids.shape[0]
    output_mask_path_relative = "pred_mask"
    name = os.path.splitext(os.path.basename(filename))[0]
    base_dir = os.path.dirname(filename)
    output_mask_path = os.path.join(base_dir, output_mask_path_relative)
    if not os.path.isdir(output_mask_path):
        os.mkdir(output_mask_path)
    insts = np.unique(instance_ids)
    zero_mask = np.zeros(shape=(instance_ids.shape[0]), dtype=np.int32)
    with open(filename, "w") as f:
        for idx, inst_id in enumerate(insts):
            if inst_id == 0:  # 0 -> no instance for this vertex
                continue
            output_mask_file = os.path.join(
                output_mask_path_relative, name + "_" + str(idx) + ".txt"
            )
            loc = np.where(instance_ids == inst_id)
            label_id = label_ids[loc[0][0]]
            # The listed path stays relative to the prediction file ...
            f.write("%s %d %f\n" % (output_mask_file, label_id, 1.0))
            # write mask
            mask = np.copy(zero_mask)
            mask[loc[0]] = 1
            # ... but the mask itself must land in the directory created
            # above, not in a "pred_mask" folder of the current directory.
            export_ids(os.path.join(base_dir, output_mask_file), mask)


# ------------ Instance Utils ------------ #


class Instance(object):
    """One instance: its id, label id and number of vertices."""

    instance_id = 0
    label_id = 0
    vert_count = 0
    med_dist = -1
    dist_conf = 0.0

    def __init__(self, mesh_vert_instances, instance_id):
        """Fill the instance from a per-vertex instance id array.

        An ``instance_id`` of -1 leaves the class-level defaults untouched.
        """
        if instance_id == -1:
            return
        self.instance_id = int(instance_id)
        self.label_id = int(self.get_label_id(instance_id))
        self.vert_count = int(
            self.get_instance_verts(mesh_vert_instances, instance_id)
        )

    def get_label_id(self, instance_id):
        """Return the label id, i.e. ``instance_id // 1000``."""
        return int(instance_id // 1000)

    def get_instance_verts(self, mesh_vert_instances, instance_id):
        """Return how many entries of the array equal ``instance_id``."""
        return (mesh_vert_instances == instance_id).sum()

    def to_json(self):
        """Serialize the instance attributes as an indented JSON string."""
        return json.dumps(
            self, default=lambda o: o.__dict__, sort_keys=True, indent=4
        )

    def to_dict(self):
        """Return the five instance fields as a plain dict."""
        return {
            "instance_id": self.instance_id,
            "label_id": self.label_id,
            "vert_count": self.vert_count,
            "med_dist": self.med_dist,
            "dist_conf": self.dist_conf,
        }

    def from_json(self, data):
        """Load the fields from a dict; distance fields are optional."""
        self.instance_id = int(data["instance_id"])
        self.label_id = int(data["label_id"])
        self.vert_count = int(data["vert_count"])
        if "med_dist" in data:
            self.med_dist = float(data["med_dist"])
            self.dist_conf = float(data["dist_conf"])

    def __str__(self):
        return "(" + str(self.instance_id) + ")"


def _is_inside(path, root):
    """Return True if absolute ``path`` is ``root`` or lies below it.

    Compares whole path components; a character-wise prefix test would
    accept e.g. ``/x/pred_evil`` for the root ``/x/pred``.
    """
    try:
        common = os.path.commonpath([path, root])
    except ValueError:
        # e.g. paths on different drives
        return False
    return os.path.normcase(common) == os.path.normcase(root)


def read_instance_prediction_file(filename, pred_path):
    """Parse a prediction file into ``{abs mask path: {"label_id", "conf"}}``.

    Each line must read ``<relative mask path> <label id> <confidence>``.
    Mask paths are resolved relative to the directory of ``filename`` and
    must lie inside ``pred_path``. Any violation is reported through
    ``util.print_error``.

    Args:
        filename: Path to the prediction text file.
        pred_path: Directory that all mask files have to live in.

    Returns:
        dict keyed by absolute mask file path.
    """
    with open(filename) as f:
        lines = f.read().splitlines()
    instance_info = {}
    abs_pred_path = os.path.abspath(pred_path)
    for line in lines:
        parts = line.split(" ")
        if len(parts) != 3:
            util.print_error(
                "invalid instance prediction file. Expected (per line): [rel path prediction] [label id prediction] [confidence prediction]"
            )
        if os.path.isabs(parts[0]):
            util.print_error(
                "invalid instance prediction file. First entry in line must be a relative path"
            )
        mask_file = os.path.join(os.path.dirname(filename), parts[0])
        mask_file = os.path.abspath(mask_file)
        # check that mask_file lives inside prediction path
        if not _is_inside(mask_file, abs_pred_path):
            util.print_error(
                "predicted mask {} in prediction text file {} points outside of prediction path.".format(
                    mask_file, filename
                )
            )

        instance_info[mask_file] = {
            "label_id": int(float(parts[1])),
            "conf": float(parts[2]),
        }
    return instance_info


def get_instances(ids, class_ids, class_labels, id2label):
    """Group the instances found in ``ids`` by class label.

    Args:
        ids: Per-vertex instance ids; 0 is skipped.
        class_ids: Label ids to keep.
        class_labels: Label names; each becomes a key of the result.
        id2label: Mapping from label id to label name.

    Returns:
        dict mapping each label name to a list of ``Instance.to_dict()``
        entries.
    """
    instances = {label: [] for label in class_labels}
    for inst_id in np.unique(ids):
        if inst_id == 0:
            continue
        inst = Instance(ids, inst_id)
        if inst.label_id in class_ids:
            instances[id2label[inst.label_id]].append(inst.to_dict())
    return instances
+ always_apply: true + brightness_by_max: true + brightness_limit: + - -0.2 + - 0.2 + contrast_limit: + - -0.2 + - 0.2 + p: 0.5 + - __class_fullname__: albumentations.augmentations.transforms.RGBShift + always_apply: true + b_shift_limit: + - -20 + - 20 + g_shift_limit: + - -20 + - 20 + p: 0.5 + r_shift_limit: + - -20 + - 20 diff --git a/models/Mask3D/build/lib/mask3d/conf/augmentation/volumentations_aug.yaml b/models/Mask3D/build/lib/mask3d/conf/augmentation/volumentations_aug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b86407a2e735ad8dbba79f83746ceb79722aedf --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/augmentation/volumentations_aug.yaml @@ -0,0 +1,53 @@ +# pi = 3.14159265358979 +# pi/2 = 1.57079632679489 +# pi/3 = 1.04719755119659 +# pi/6 = 0.52359877559829 +# pi/12 = 0.26179938779914 +# pi/24 = 0.13089969389957 +# +__version__: 0.1.6 +transform: + __class_fullname__: volumentations.core.composition.Compose + additional_targets: {} + p: 1.0 + transforms: + - __class_fullname__: volumentations.augmentations.transforms.Scale3d + always_apply: true + p: 0.5 + scale_limit: + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 0 + - 1 + p: 0.5 + rotation_limit: + - -3.141592653589793 + - 3.141592653589793 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 1 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 1 + - 0 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 diff --git a/models/Mask3D/build/lib/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml b/models/Mask3D/build/lib/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..7f0958eed35ea4317ddc3f2378dd66336472c0fa --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml @@ -0,0 +1,11 @@ +# @package _group_ +- _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: val_mean_ap_50 + save_last: true + save_top_k: 1 + mode: max + dirpath: ${general.save_dir} + filename: "{epoch}-{val_mean_ap_50:.3f}" + every_n_epochs: 1 + +- _target_: pytorch_lightning.callbacks.LearningRateMonitor diff --git a/models/Mask3D/build/lib/mask3d/conf/config_base_instance_segmentation.yaml b/models/Mask3D/build/lib/mask3d/conf/config_base_instance_segmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61aeae0519bd308a58293d07ee902beb6a64ed5d --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/config_base_instance_segmentation.yaml @@ -0,0 +1,75 @@ +general: + train_mode: true + task: "instance_segmentation" + seed: null + checkpoint: null + backbone_checkpoint: null + freeze_backbone: false # train only last layer + linear_probing_backbone: false + train_on_segments: false + eval_on_segments: false + filter_out_instances: false + save_visualizations: false + visualization_point_size: 20 + decoder_id: -1 + export: false + use_dbscan: false + ignore_class_threshold: 100 + project_name: scannet + workspace: jonasschult + experiment_name: DEBUG_ABLATION + num_targets: 19 + add_instance: true + dbscan_eps: 0.95 + dbscan_min_points: 1 + + + export_threshold: 0.0001 + + reps_per_epoch: 1 + + on_crops: false + + scores_threshold: 0.0 + iou_threshold: 1.0 + + area: 5 + + eval_inner_core: -1 # disabled + + topk_per_image: 100 + + ignore_mask_idx: [] + + max_batch_size: 99999999 + + save_dir: saved/${general.experiment_name} + # time/commit/md5(config)_uuid + # time/experiment_id/version_uuid + # experiment_id: 1 # commit[:8], or unique from logger + # version: 1 # md5[:8] of config + + gpus: 1 + +defaults: + - data: indoor + - 
data/data_loaders: simple_loader + - data/datasets: scannet + - data/collation_functions: voxelize_collate + - logging: full + - model: mask3d + - metrics: miou + - optimizer: adamw + - scheduler: onecyclelr + - trainer: trainer600 + - callbacks: callbacks_instance_segmentation + - matcher: hungarian_matcher + - loss: set_criterion + +hydra: + run: + dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} + # dir: ${general.save_dir} + subdir: ${hydra.job.num}_${hydra.job.id} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate.yaml b/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..026552efb024e4e6fd90bf6bda9df283da2bf4c1 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate.yaml @@ -0,0 +1,42 @@ +# @package data + +train_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.train_mode} + small_crops: false + very_small_crops: false + batch_instance: false + probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.train_dataset.filter_out_classes} + label_offset: ${data.train_dataset.label_offset} + num_queries: ${model.num_queries} + +validation_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.validation_mode} + batch_instance: false + probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.validation_dataset.filter_out_classes} + label_offset: ${data.validation_dataset.label_offset} + num_queries: ${model.num_queries} + +test_collation: 
+ _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.test_mode} + batch_instance: false + probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.test_dataset.filter_out_classes} + label_offset: ${data.test_dataset.label_offset} + num_queries: ${model.num_queries} \ No newline at end of file diff --git a/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml b/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5d3471d143ddfe999d8f3031e41ba6efce2e879 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml @@ -0,0 +1,36 @@ +# @package data + +train_collation: + _target_: mask3d.datasets.utils.VoxelizeCollateMerge + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.train_mode} + small_crops: false + very_small_crops: false + scenes: 2 + batch_instance: false + make_one_pc_noise: false + place_nearby: false + place_far: false + proba: 1 + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} + +validation_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.validation_mode} + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} + +test_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.test_mode} + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} diff --git 
a/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader.yaml b/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39996e14d769c2ba9341da582a1f7bf970fc7925 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader.yaml @@ -0,0 +1,22 @@ +# @package data + +train_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: true + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.batch_size} + +validation_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.test_batch_size} + +test_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.test_batch_size} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml b/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1b1b45d13167dc07357a13feb5a513dd71c9a2e --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml @@ -0,0 +1,22 @@ +# @package data + +train_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: true + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.batch_size} + +validation_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: 1 + batch_size: ${data.test_batch_size} + +test_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: 1 + batch_size: ${data.test_batch_size} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport.yaml 
b/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6728ab9eb26bc78f435237d9d7d61800b900735d --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport.yaml @@ -0,0 +1,48 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/matterport + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/matterport + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + 
add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport_scannet.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport_scannet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df259ceaadfa68a90c2b8a60d7b74a958b30c79d --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/matterport_scannet.yaml @@ -0,0 +1,50 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: + - data/processed/scannet + - data/processed/matterport + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + 
volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/rio.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/rio.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1adfea36fea05b14a7fa95382677aee6144d1b4b --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/rio.yaml @@ -0,0 +1,48 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + 
add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/s3dis.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/s3dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e1385416655514397d82737e1edc2d1a5997657 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/s3dis.yaml @@ -0,0 +1,87 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/s3dis/label_database.yaml + color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: False + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + 
crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/s3dis/label_database.yaml + color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/s3dis/label_database.yaml + color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..50f1c6c5998d8f3c6dae35ef508225dff4b0271f --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet.yaml @@ -0,0 +1,79 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: false + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + add_unlabeled_pc: false + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + 
crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet200.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet200.yaml new file mode 100644 index 0000000000000000000000000000000000000000..730a6ab9f1965004ec9828d1e8b2429005bef6f2 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/scannet200.yaml @@ -0,0 +1,79 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + # different augs experiments + instance_oversampling: 0.0 + 
place_around_existing: false + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + add_unlabeled_pc: false + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: null + volume_augmentations_path: null + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: null + volume_augmentations_path: null + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 diff --git 
a/models/Mask3D/build/lib/mask3d/conf/data/datasets/semantic_kitti.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/semantic_kitti.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9540ad610bd4a68d64369519d20e13009df9feda --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/semantic_kitti.yaml @@ -0,0 +1,42 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.train_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + +validation_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.validation_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: null + +test_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.test_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: null diff --git a/models/Mask3D/build/lib/mask3d/conf/data/datasets/stpls3d.yaml b/models/Mask3D/build/lib/mask3d/conf/data/datasets/stpls3d.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..913667d4123a7edead9d948358ae25cf9f7b4bb1 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/datasets/stpls3d.yaml @@ -0,0 +1,95 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: False + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + reps_per_epoch: ${general.reps_per_epoch} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 + is_elastic_distortion: true + color_drop: 0.0 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + 
add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + on_crops: ${general.on_crops} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + on_crops: ${general.on_crops} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 diff --git a/models/Mask3D/build/lib/mask3d/conf/data/indoor.yaml b/models/Mask3D/build/lib/mask3d/conf/data/indoor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..868c37ccfe901f14396b68a38eac47b42cb3e812 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/indoor.yaml @@ -0,0 +1,43 @@ +# @package _group_ + +# these parameters are inherited by datasets, data_loaders and collators +# but they might be overwritten + +# splits +train_mode: train +validation_mode: validation +test_mode: validation # test # validation + +# dataset +ignore_label: 255 
+add_raw_coordinates: true # 3dim +add_colors: true # 3dim +add_normals: false # 3dim +in_channels: 3 # in_channels = 3 * (add_normals + add_colors + add_raw_coordinates) +num_labels: 20 +# num_labels: 41 +add_instance: ${general.add_instance} +task: ${general.task} + +# data loader +pin_memory: false +num_workers: 4 +batch_size: 5 +test_batch_size: 1 +cache_data: false + +# collation +voxel_size: 0.02 + +reps_per_epoch: ${general.reps_per_epoch} + +cropping: false +cropping_args: + min_points: 30000 + aspect: 0.8 + min_crop: 0.5 + max_crop: 1.0 + +crop_min_size: 20000 +crop_length: 6.0 +cropping_v1: true \ No newline at end of file diff --git a/models/Mask3D/build/lib/mask3d/conf/data/outdoor.yaml b/models/Mask3D/build/lib/mask3d/conf/data/outdoor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a77474f62d1cfb53f130160f641c65cb81a62956 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/data/outdoor.yaml @@ -0,0 +1,26 @@ +# @package _group_ + +# these parameters are inherited by datasets, data_loaders and collators +# but they might be overwritten + +# splits +train_mode: train +validation_mode: validation +test_mode: validation + +# dataset +ignore_label: 255 +add_distance: true # 1dim +add_reflection: true # 1dim +in_channels: 2 # in_channels = add_distance + add_reflection +num_labels: 19 +add_instance: false + +# data loader +pin_memory: true +num_workers: 4 +batch_size: 18 +sweep: 1 + +# collation +voxel_size: 0.15 diff --git a/models/Mask3D/build/lib/mask3d/conf/logging/base.yaml b/models/Mask3D/build/lib/mask3d/conf/logging/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d700a101ddf3d1e2c1a3cdea08190afff762a5b --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/logging/base.yaml @@ -0,0 +1,10 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.NeptuneLogger + project_name: ${general.workspace}/${general.project_name} + experiment_name: ${general.experiment_name} + offline_mode: false 
+ +- _target_: pytorch_lightning.loggers.CSVLogger + save_dir: ${general.save_dir} + name: ${general.experiment_id} + version: ${general.version} diff --git a/models/Mask3D/build/lib/mask3d/conf/logging/full.yaml b/models/Mask3D/build/lib/mask3d/conf/logging/full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b434e94dc1f0889cf0829b5f89b8509717a3546c --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/logging/full.yaml @@ -0,0 +1,8 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.WandbLogger + project: ${general.project_name} + name: ${general.experiment_name} + save_dir: ${general.save_dir} + entity: "schult" + resume: "allow" + id: ${general.experiment_name} diff --git a/models/Mask3D/build/lib/mask3d/conf/logging/minimal.yaml b/models/Mask3D/build/lib/mask3d/conf/logging/minimal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1c46e26fefedcec50d4fdc9fc77c187d60cf7b9 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/logging/minimal.yaml @@ -0,0 +1,5 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.CSVLogger + save_dir: ${general.save_dir} + name: ${general.experiment_id} + version: ${general.version} diff --git a/models/Mask3D/build/lib/mask3d/conf/logging/offline.yaml b/models/Mask3D/build/lib/mask3d/conf/logging/offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..914ad19142ca22c3778be709208323908460ebac --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/logging/offline.yaml @@ -0,0 +1,10 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.TensorBoardLogger + name: ${general.experiment_id} + version: ${general.version} + save_dir: ${general.save_dir} + +- _target_: pytorch_lightning.loggers.CSVLogger + name: ${general.experiment_id} + version: ${general.version} + save_dir: ${general.save_dir} \ No newline at end of file diff --git a/models/Mask3D/build/lib/mask3d/conf/loss/cross_entropy.yaml 
b/models/Mask3D/build/lib/mask3d/conf/loss/cross_entropy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c000f40ad2ab40605c244e38243a6e0cc7933768 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/loss/cross_entropy.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.nn.CrossEntropyLoss +ignore_index: ${data.ignore_label} diff --git a/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion.yaml b/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c04ba49ce1823c2d6e923a03ae0514490d463e9 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion.yaml @@ -0,0 +1,11 @@ +# @package _group_ +_target_: mask3d.models.criterion.SetCriterion +num_classes: ${general.num_targets} +eos_coef: 0.1 +losses: + - "labels" + - "masks" +num_points: ${matcher.num_points} +oversample_ratio: 3.0 +importance_sample_ratio: 0.75 +class_weights: -1 diff --git a/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion_custom_weights_1.yaml b/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion_custom_weights_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d2c308e081c1ffa61beb13308b27e6ff753f0f4 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/loss/set_criterion_custom_weights_1.yaml @@ -0,0 +1,11 @@ +# @package _group_ +_target_: mask3d.models.criterion.SetCriterion +num_classes: ${general.num_targets} +eos_coef: 0.1 +losses: + - "labels" + - "masks" +num_points: ${matcher.num_points} +oversample_ratio: 3.0 +importance_sample_ratio: 0.75 +class_weights: [1.0,1.5,10.0,1.0,1.0,1.0,1.0,1.0,10.0,10.0,1.0,10.0,1.0,1.0] diff --git a/models/Mask3D/build/lib/mask3d/conf/matcher/hungarian_matcher.yaml b/models/Mask3D/build/lib/mask3d/conf/matcher/hungarian_matcher.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47750b20906b6b40a131b702ba360e36ee4c8380 --- /dev/null +++ 
b/models/Mask3D/build/lib/mask3d/conf/matcher/hungarian_matcher.yaml @@ -0,0 +1,6 @@ +# @package _group_ +_target_: mask3d.models.matcher.HungarianMatcher +cost_class: 2. +cost_mask: 5. +cost_dice: 2. +num_points: -1 diff --git a/models/Mask3D/build/lib/mask3d/conf/metrics/miou.yaml b/models/Mask3D/build/lib/mask3d/conf/metrics/miou.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68d1b61181d9615d7d6d7638261d119a4fc47074 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/metrics/miou.yaml @@ -0,0 +1,4 @@ +# @package _group_ +_target_: mask3d.models.metrics.ConfusionMatrix +num_classes: ${data.num_labels} +ignore_label: ${data.ignore_label} diff --git a/models/Mask3D/build/lib/mask3d/conf/model/mask3d.yaml b/models/Mask3D/build/lib/mask3d/conf/model/mask3d.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95718d8710477650561e0ddd845688f50c868032 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/model/mask3d.yaml @@ -0,0 +1,47 @@ +# @package _group_ +_target_: mask3d.models.Mask3D + +# transformer parameters +hidden_dim: 128 +dim_feedforward: 1024 +num_queries: 100 +num_heads: 8 +num_decoders: 3 +dropout: 0.0 +pre_norm: false +use_level_embed: false +normalize_pos_enc: true +positional_encoding_type: "fourier" +gauss_scale: 1.0 +hlevels: [0,1,2,3] + +# queries +non_parametric_queries: true +random_query_both: false +random_normal: false +random_queries: false +use_np_features: false + +# sampling +sample_sizes: [200, 800, 3200, 12800, 51200] +max_sample_size: false # change false means sampling activated + +shared_decoder: true +num_classes: ${general.num_targets} +train_on_segments: ${general.train_on_segments} +scatter_type: "mean" + +voxel_size: ${data.voxel_size} + +config: + backbone: + _target_: mask3d.models.Res16UNet34C + config: + dialations: [ 1, 1, 1, 1 ] + conv1_kernel_size: 5 + bn_momentum: 0.02 + # depends on normals, color, raw_coordinates + # varies from 3 to 9 + in_channels: 
${data.in_channels} + out_channels: ${data.num_labels} + out_fpn: true diff --git a/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw.yaml b/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b4020d1ddd1444c94ea5bfbe1281c485fca587e --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.optim.AdamW +lr: 0.0001 \ No newline at end of file diff --git a/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw_lower.yaml b/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw_lower.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e42f091a0d5dd03b66ab1dcec8b81d78a692af9 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/optimizer/adamw_lower.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.optim.AdamW +lr: 0.005 diff --git a/models/Mask3D/build/lib/mask3d/conf/scheduler/exponentiallr.yaml b/models/Mask3D/build/lib/mask3d/conf/scheduler/exponentiallr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc5224083670b286d75fda46304560dbcca3aecb --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/scheduler/exponentiallr.yaml @@ -0,0 +1,11 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.ExponentialLR + gamma: 0.99999 + last_epoch: -1 # ${trainer.max_epochs} + # need to set to number because of tensorboard logger + # steps_per_epoch: -1 + +pytorch_lightning_params: + interval: step diff --git a/models/Mask3D/build/lib/mask3d/conf/scheduler/lambdalr.yaml b/models/Mask3D/build/lib/mask3d/conf/scheduler/lambdalr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b63f6f4333e98931ce22f1a38829de0ef51a3719 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/scheduler/lambdalr.yaml @@ -0,0 +1,8 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.StepLR + step_size: 99999 + +pytorch_lightning_params: + 
interval: epoch diff --git a/models/Mask3D/build/lib/mask3d/conf/scheduler/onecyclelr.yaml b/models/Mask3D/build/lib/mask3d/conf/scheduler/onecyclelr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c788877193d7366c21088cf9fefb77e4f62ef4d9 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/scheduler/onecyclelr.yaml @@ -0,0 +1,11 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.OneCycleLR + max_lr: ${optimizer.lr} + epochs: ${trainer.max_epochs} + # need to set to number because of tensorboard logger + steps_per_epoch: -1 + +pytorch_lightning_params: + interval: step diff --git a/models/Mask3D/build/lib/mask3d/conf/trainer/trainer.yaml b/models/Mask3D/build/lib/mask3d/conf/trainer/trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f436300f9ca6bbbe96ca6c1b4c7e8eeffe35fabd --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/trainer/trainer.yaml @@ -0,0 +1,7 @@ +# @package _group_ +deterministic: false +max_epochs: 1000 +min_epochs: 1 +resume_from_checkpoint: null +check_val_every_n_epoch: 50 +num_sanity_val_steps: -1 diff --git a/models/Mask3D/build/lib/mask3d/conf/trainer/trainer600.yaml b/models/Mask3D/build/lib/mask3d/conf/trainer/trainer600.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc9f00295aafe3431d1c0e7ca50dbc29559ea134 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/conf/trainer/trainer600.yaml @@ -0,0 +1,7 @@ +# @package _group_ +deterministic: false +max_epochs: 601 +min_epochs: 1 +resume_from_checkpoint: null +check_val_every_n_epoch: 50 +num_sanity_val_steps: 2 diff --git a/models/Mask3D/build/lib/mask3d/datasets/__init__.py b/models/Mask3D/build/lib/mask3d/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/Mask3D/build/lib/mask3d/datasets/outdoor_semseg.py b/models/Mask3D/build/lib/mask3d/datasets/outdoor_semseg.py new file mode 100644 
class LidarDataset(Dataset):
    """Dataset of preprocessed LiDAR sweeps grouped per scene.

    Every entry of the ``{mode}_database.yaml`` files found in ``data_dir``
    describes one sweep (it must provide ``scene``, ``filepath`` and
    ``pose``).  Consecutive sweeps of the same scene are grouped into chunks
    of ``sweep`` elements and each chunk is served as one sample.

    Args:
        data_dir: one directory or a sequence of directories that contain
            ``{mode}_database.yaml``.
        label_db_filepath: yaml file mapping label ids to label descriptions
            (each description must contain a ``validation`` flag).
        mode: name of the split; augmentations are applied only when the
            name contains ``"train"``.
        add_reflection: keep the feature columns stored in the point file;
            when false they are replaced by a single column of ones.
        add_distance: append the distance of every point to the mean
            coordinate of the sample as an extra feature column.
        add_instance: return both label columns instead of only the first.
        num_labels: must equal either the number of all labels or the number
            of labels flagged ``validation`` in the label database.
        data_percent: fraction of the samples to keep (prefix of the list).
        ignore_label: value written for labels missing in the label database.
        volume_augmentations_path: yaml file loaded with ``V.load``; no
            augmentation (``V.NoOp``) when ``None``.
        sweep: number of consecutive sweeps merged into one sample.

    Raises:
        SystemExit: when a ``{mode}_database.yaml`` file is missing.
        ValueError: when ``num_labels`` matches no available label count.
    """

    def __init__(
        self,
        data_dir: Optional[
            Union[str, Tuple[str]]
        ] = "data/processed/semantic_kitti",
        label_db_filepath: Optional[
            str
        ] = "./data/processed/semantic_kitti/label_database.yaml",
        mode: Optional[str] = "train",
        add_reflection: Optional[bool] = True,
        add_distance: Optional[bool] = False,
        add_instance: Optional[bool] = True,
        num_labels: Optional[int] = -1,
        data_percent: Optional[float] = 1.0,
        ignore_label: Optional[Union[int, List[int]]] = 255,
        volume_augmentations_path: Optional[str] = None,
        sweep: Optional[int] = 1,
    ):
        self.mode = mode
        self.data_dir = data_dir
        if isinstance(data_dir, str):
            self.data_dir = [self.data_dir]
        self.ignore_label = ignore_label
        self.add_instance = add_instance
        self.add_distance = add_distance
        self.add_reflection = add_reflection

        # loading database files
        self._data = []
        for database_path in self.data_dir:
            database_file = Path(database_path) / f"{mode}_database.yaml"
            if not database_file.exists():
                # SystemExit is what the builtin exit() raised before; raising
                # it directly does not depend on the `site` module and yields
                # a non-zero exit status together with the message.
                raise SystemExit(
                    f"generate {database_path}/{mode}_database.yaml first"
                )
            self._data.extend(self._load_yaml(database_file))

        labels = self._load_yaml(Path(label_db_filepath))
        self._labels = self._select_correct_labels(labels, num_labels)

        # augmentations
        self.volume_augmentations = V.NoOp()
        if volume_augmentations_path is not None:
            self.volume_augmentations = V.load(
                volume_augmentations_path, data_format="yaml"
            )

        # reformulating in sweeps: group consecutive entries of one scene,
        # then cut every group into chunks of `sweep` entries.  An empty
        # database simply produces an empty dataset.
        scenes = []
        last_scene = None
        for entry in self._data:
            if scenes and entry["scene"] == last_scene:
                scenes[-1].append(entry)
            else:
                last_scene = entry["scene"]
                scenes.append([entry])
        self._data = [
            chunk
            for scene_entries in scenes
            for chunk in self.chunks(scene_entries, sweep)
        ]

        if data_percent < 1.0:
            self._data = self._data[: int(len(self._data) * data_percent)]

    @staticmethod
    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        """Load, pose-transform and stack all sweeps of sample ``idx``.

        The stored arrays are read as: first three columns coordinates, last
        two columns labels, everything in between features.

        Returns:
            Tuple ``(coordinates, features, labels)``.  ``labels`` keeps both
            label columns when ``add_instance`` is set, otherwise only the
            remapped first column.
        """
        clouds = []
        for sweep in self.data[idx]:
            cloud = np.load(sweep["filepath"])
            pose = np.array(sweep["pose"])
            # rotate
            cloud[:, :3] = cloud[:, :3] @ pose[:3, :3]
            # translate
            cloud[:, :3] += pose[:3, 3]
            clouds.append(cloud)
        points = np.vstack(clouds)

        coordinates, features, labels = (
            points[:, :3],
            points[:, 3:-2],
            points[:, -2:],
        )

        if not self.add_reflection:
            # constant placeholder feature, one column per point
            features = np.ones((len(coordinates), 1))

        if self.add_distance:
            center_coordinate = coordinates.mean(0)
            features = np.hstack(
                (
                    features,
                    np.linalg.norm(coordinates - center_coordinate, axis=1)[
                        :, np.newaxis
                    ],
                )
            )

        # volume and image augmentations for train
        if "train" in self.mode:
            coordinates -= coordinates.mean(0)
            if 0.5 > random():
                coordinates += (
                    np.random.uniform(coordinates.min(0), coordinates.max(0))
                    / 2
                )
            aug = self.volume_augmentations(
                points=coordinates,
                features=features,
                labels=labels,
            )
            coordinates, features, labels = (
                aug["points"],
                aug["features"],
                aug["labels"],
            )

        # prepare labels and map them to consecutive indices starting at 0
        labels = labels.astype(np.int32)
        if labels.size > 0:
            labels[:, 0] = self._remap_from_zero(labels[:, 0])
            if not self.add_instance:
                # taking only first column, which is segmentation label, not instance
                labels = labels[:, 0].flatten()

        return coordinates, features, labels

    @property
    def data(self):
        """database file containing information about preprocessed dataset"""
        return self._data

    @property
    def label_info(self):
        """database file containing information labels used by dataset"""
        return self._labels

    @staticmethod
    def _load_yaml(filepath):
        with open(filepath) as f:
            file = yaml.safe_load(f)
        return file

    def _select_correct_labels(self, labels, num_labels):
        """Return all labels or only the ``validation`` ones, by count.

        Raises:
            ValueError: if ``num_labels`` equals neither count.
        """
        valid_labels = {k: v for k, v in labels.items() if v["validation"]}
        number_of_all_labels = len(labels)
        number_of_validation_labels = len(valid_labels)

        if num_labels == number_of_all_labels:
            return labels
        if num_labels == number_of_validation_labels:
            return valid_labels
        msg = f"""not available number labels, select from:
            {number_of_validation_labels}, {number_of_all_labels}"""
        raise ValueError(msg)

    def _remap_from_zero(self, labels):
        """Map raw label ids to ``0..len(label_info) - 1`` in place.

        Ids missing from ``label_info`` become ``ignore_label``.  Matching is
        done against a snapshot of the input so that an index written for one
        key can never be picked up again by a later key with the same value.
        """
        keys = list(self.label_info.keys())
        original = labels.copy()
        labels[~np.isin(original, keys)] = self.ignore_label
        # remap to the range from 0
        for i, k in enumerate(keys):
            labels[original == k] = i
        return labels

    def _remap_model_output(self, output):
        """Inverse of ``_remap_from_zero``: indices back to raw label ids."""
        output = np.array(output)
        output_remapped = output.copy()
        for i, k in enumerate(self.label_info.keys()):
            output_remapped[output == i] = k
        return output_remapped
class ARKitScenesPreprocessing(BasePreprocessing):
    """Convert ARKitScenes meshes into the point array format used by Mask3D.

    For every mode the constructor collects ``<data_dir>/<mode>/<scene>/<mesh_file>``
    for each scene folder that actually contains the mesh file.

    Args:
        data_dir: root folder with one sub-folder per mode.
        save_dir: folder the processed arrays and databases are written to.
        modes: names of the mode sub-folders to process.
        n_jobs: number of parallel workers (forwarded to the base class).
        git_repo: unused; kept so existing callers and CLI invocations
            keep working.
        mesh_file: file name of the mesh inside every scene folder.
        scannet200: when true, ``fix_bugs_in_labels`` does nothing.
    """

    def __init__(
        self,
        data_dir: str = "/home/weders/scratch/scratch/scannetter/arkit/raw",
        save_dir: str = "/home/weders/scratch/scratch/scannetter/arkit/raw",
        modes: tuple = ('Validation', ),
        n_jobs: int = 1,
        git_repo: str = "./data/raw/scannet/ScanNet",
        mesh_file: str="mesh_tsdf.ply",
        scannet200: bool = False,
    ):
        super().__init__(data_dir, save_dir, modes, n_jobs)

        self.scannet200 = scannet200
        for mode in self.modes:
            mode_dir = self.data_dir / mode
            filepaths = []
            for scene in os.listdir(mode_dir):
                mesh_path = mode_dir / scene / mesh_file
                # scenes without the requested mesh are silently skipped
                if mesh_path.exists():
                    filepaths.append(mesh_path)
            self.files[mode] = natsorted(filepaths)

    def process_file(self, filepath, mode):
        """process_file.

        Loads coordinates and features from the ply file, appends four
        zero-filled placeholder columns for segments and labels and stores
        the result as float32.  The output path does not depend on the
        scene, so every file of a mode is written to the same
        ``data_mask3d.npy``.

        Args:
            filepath: path to the main ply file; the parent folder name must
                be an integer scene id
            mode: train, test or validation

        Returns:
            filebase: info about file
        """
        scene = int(filepath.parent.name)
        print(scene)
        filebase = {
            "filepath": filepath,
            "scene": scene,
            "sub_scene": scene,
            "raw_filepath": str(filepath),
            "file_len": -1,
        }
        coords, features, _ = load_ply_with_normals(filepath)
        file_len = len(coords)
        filebase["file_len"] = file_len
        points = np.hstack((coords, features))

        print(features.shape)

        # adding segment and label fake columns
        points = np.concatenate((points, np.zeros((file_len, 4))), axis=1)

        processed_filepath = self.save_dir / mode / "data_mask3d.npy"
        processed_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(processed_filepath, points.astype(np.float32))
        filebase["filepath"] = str(processed_filepath)

        return filebase

    @logger.catch
    def fix_bugs_in_labels(self):
        """Drop points carrying known wrong labels from specific train files.

        Files that do not exist in ``save_dir`` are skipped with a warning
        instead of aborting the whole pass on the first missing file.
        """
        if not self.scannet200:
            logger.add(self.save_dir / "fixed_bugs_in_labels.log")
            # (scene, sub_scene) -> label value whose points are removed
            found_wrong_labels = {
                (270, 0): 50,
                (270, 2): 50,
                (384, 0): 149,
            }
            for (scene, sub_scene), wrong_label in found_wrong_labels.items():
                bug_file = (
                    self.save_dir / "train" / f"{scene:04}_{sub_scene:02}.npy"
                )
                if not bug_file.exists():
                    logger.warning(f"Skipping missing file {bug_file}")
                    continue
                points = np.load(bug_file)
                # keep only points whose last column differs from the wrong label
                bug_mask = points[:, -1] != wrong_label
                points = points[bug_mask]
                np.save(bug_file, points)
                logger.info(f"Fixed {bug_file}")

    def _parse_scene_subscene(self, name):
        """Extract ``(scene, sub_scene)`` from a ``sceneXXXX_YY`` name.

        Raises:
            ValueError: if ``name`` does not start with that pattern.
        """
        scene_match = re.match(r"scene(\d{4})_(\d{2})", name)
        print(scene_match)
        if scene_match is None:
            raise ValueError(
                f"cannot parse scene and sub-scene from {name!r}"
            )
        return int(scene_match.group(1)), int(scene_match.group(2))
class BasePreprocessing:
    """Shared driver for dataset preprocessing scripts.

    Subclasses fill ``self.files[mode]`` with the inputs of every mode and
    implement ``process_file``; this class runs the processing (in parallel
    or sequentially) and writes one ``<mode>_database.yaml`` per mode into
    ``save_dir``.

    Args:
        data_dir: folder with the raw data; must exist.
        save_dir: output folder; created when missing.
        modes: names of the splits to process.
        n_jobs: number of joblib workers, ``-1`` meaning one per CPU.

    Raises:
        FileNotFoundError: if ``data_dir`` does not exist.
    """

    def __init__(
        self,
        data_dir: str = "./data/raw/",
        save_dir: str = "./data/processed/",
        modes: tuple = ("train", "validation", "test"),
        n_jobs: int = -1,
    ):
        self.data_dir = Path(data_dir)
        self.save_dir = Path(save_dir)
        self.n_jobs = n_jobs
        self.modes = modes

        if not self.data_dir.exists():
            logger.error("data folder doesn't exist")
            raise FileNotFoundError(
                f"data folder doesn't exist: {self.data_dir}"
            )
        self.save_dir.mkdir(parents=True, exist_ok=True)

        self.files = {data_type: [] for data_type in self.modes}

    @logger.catch
    def preprocess(self):
        """Process the files of every mode in parallel and save the databases.

        Unlike ``preprocess_sequential`` this does not run
        ``fix_bugs_in_labels``, ``joint_database`` or
        ``compute_color_mean_std`` afterwards.
        """
        self.n_jobs = (
            multiprocessing.cpu_count() if self.n_jobs == -1 else self.n_jobs
        )
        for mode in self.modes:
            logger.info(f"Tasks for {mode}: {len(self.files[mode])}")
            parallel_results = Parallel(n_jobs=self.n_jobs, verbose=10)(
                delayed(self.process_file)(file, mode)
                for file in self.files[mode]
            )
            self.save_database(list(parallel_results), mode)

    def preprocess_sequential(self):
        """Process every mode file by file, then run the post-processing steps."""
        for mode in self.modes:
            database = [
                self.process_file(filepath, mode)
                for filepath in tqdm(self.files[mode], unit="file")
            ]
            self.save_database(database, mode)
        self.fix_bugs_in_labels()
        self.joint_database()
        self.compute_color_mean_std(
            train_database_path=(self.save_dir / "train_database.yaml")
        )

    def process_file(self, filepath, mode):
        """process_file.

        Args:
            filepath: path to the main file
            mode: typically train, test or validation

        Returns:
            filebase: info about file
        """
        raise NotImplementedError

    def make_instance_database_sequential(
        self,
        train_database_path: str = "./data/processed/train_database.yaml",
        mode="instance",
    ):
        """Extract the instances of every database sample, one sample at a time."""
        train_database = self._load_yaml(train_database_path)
        instance_database = [
            self.extract_instance_from_file(sample)
            for sample in tqdm(train_database)
        ]
        self.save_database(instance_database, mode=mode)

    @logger.catch
    def make_instance_database(
        self,
        train_database_path: str = "./data/processed/train_database.yaml",
        mode="instance",
    ):
        """Parallel variant of ``make_instance_database_sequential``."""
        self.n_jobs = (
            multiprocessing.cpu_count() if self.n_jobs == -1 else self.n_jobs
        )
        train_database = self._load_yaml(train_database_path)
        logger.info(f"Files in database: {len(train_database)}")
        parallel_results = Parallel(n_jobs=self.n_jobs, verbose=10)(
            delayed(self.extract_instance_from_file)(sample)
            for sample in train_database
        )
        self.save_database(list(parallel_results), mode=mode)

    def extract_instance_from_file(self, sample_from_database):
        """Split one processed file into per-instance point arrays.

        Points are grouped by the value of the last column; every group is
        saved as float32 under ``<save_dir>/instances/<md5>.npy`` where the
        hash is built from the source file path and the group id.

        Returns:
            List with one description dict per saved instance.
        """
        points = np.load(sample_from_database["filepath"])
        instance_ids = points[:, -1]
        instances_dir = self.save_dir / "instances"
        instances_dir.mkdir(parents=True, exist_ok=True)

        file_instances = []
        for instance_id in np.unique(instance_ids):
            occupied_indices = np.isin(instance_ids, instance_id)
            instance_points = points[occupied_indices].copy()
            # NOTE(review): classes are read from the hard-coded column 9
            # while instance ids come from the last column - confirm that
            # column 9 holds the class label for this dataset's layout.
            instance_classes = (
                np.unique(instance_points[:, 9]).astype(int).tolist()
            )

            hash_string = str(sample_from_database["filepath"]) + str(
                instance_id
            )
            hash_string = md5(hash_string.encode("utf-8")).hexdigest()
            instance_filepath = instances_dir / f"{hash_string}.npy"
            instance = {
                "classes": instance_classes,
                "instance_filepath": str(instance_filepath),
                "instance_size": len(instance_points),
                "original_file": str(sample_from_database["filepath"]),
            }
            np.save(instance_filepath, instance_points.astype(np.float32))
            file_instances.append(instance)
        return file_instances

    def fix_bugs_in_labels(self):
        """Hook for dataset specific label fixes; no-op by default."""
        pass

    def compute_color_mean_std(
        self,
        train_database_path: str = "./data/processed/train_database.yaml",
    ):
        """Hook for computing color statistics; no-op by default."""
        pass

    def save_database(self, database, mode):
        """Make every entry yaml-serializable and write ``<mode>_database.yaml``."""
        for element in database:
            self._dict_to_yaml(element)
        self._save_yaml(self.save_dir / (mode + "_database.yaml"), database)

    def joint_database(self, train_modes=("train", "validation")):
        """Concatenate the databases of ``train_modes`` into one yaml file."""
        joint_db = []
        for mode in train_modes:
            joint_db.extend(
                self._load_yaml(self.save_dir / (mode + "_database.yaml"))
            )
        self._save_yaml(
            self.save_dir / "train_validation_database.yaml", joint_db
        )

    @classmethod
    def _read_json(cls, path):
        with open(path) as f:
            file = json.load(f)
        return file

    @classmethod
    def _save_yaml(cls, path, file):
        with open(path, "w") as f:
            yaml.safe_dump(
                file, f, default_style=None, default_flow_style=False
            )

    @classmethod
    def _dict_to_yaml(cls, dictionary):
        """Convert values ``yaml.safe_dump`` cannot represent, in place.

        Nested dicts are processed recursively; numpy arrays become lists,
        numpy scalars become Python scalars and paths become strings.
        Anything that is not a dict is left untouched.
        """
        if not isinstance(dictionary, dict):
            return
        # iterate over a snapshot because values are reassigned in the loop
        for k, v in list(dictionary.items()):
            if isinstance(v, dict):
                cls._dict_to_yaml(v)
            elif isinstance(v, np.ndarray):
                dictionary[k] = v.tolist()
            elif isinstance(v, np.generic):
                dictionary[k] = v.item()
            elif isinstance(v, Path):
                dictionary[k] = str(v)

    @classmethod
    def _load_yaml(cls, filepath):
        with open(filepath) as f:
            file = yaml.safe_load(f)
        return file
class S3DISPreprocessing(BasePreprocessing):
    """Preprocessing of S3DIS, where every ``Area_*`` folder is one mode."""

    def __init__(
        self,
        data_dir: str = "./data/raw/s3dis",
        save_dir: str = "./data/processed/s3dis",
        modes: tuple = (
            "Area_1",
            "Area_2",
            "Area_3",
            "Area_4",
            "Area_5",
            "Area_6",
        ),
        n_jobs: int = -1,
    ):
        """Set up the label tables and collect the scene folders of each area.

        Args:
            data_dir: folder containing the ``Area_*`` directories
            save_dir: output folder
            modes: area names to process
            n_jobs: number of parallel workers, -1 means one per CPU core
        """
        super().__init__(data_dir, save_dir, modes, n_jobs)

        self.class_map = {
            "ceiling": 0,
            "floor": 1,
            "wall": 2,
            "beam": 3,
            "column": 4,
            "window": 5,
            "door": 6,
            "table": 7,
            "chair": 8,
            "sofa": 9,
            "bookcase": 10,
            "board": 11,
            "clutter": 12,
            "stairs": 12,  # stairs are also mapped to clutter
        }

        self.color_map = [
            [0, 255, 0],  # ceiling
            [0, 0, 255],  # floor
            [0, 255, 255],  # wall
            [255, 255, 0],  # beam
            [255, 0, 255],  # column
            [100, 100, 255],  # window
            [200, 200, 100],  # door
            [170, 120, 200],  # table
            [255, 0, 0],  # chair
            [200, 100, 100],  # sofa
            [10, 200, 100],  # bookcase
            [200, 200, 200],  # board
            [50, 50, 50],  # clutter
        ]

        self.create_label_database()

        for mode in self.modes:
            with os.scandir(self.data_dir / mode) as entries:
                filepaths = [f.path for f in entries if f.is_dir()]
            self.files[mode] = natsorted(filepaths)

    def create_label_database(self):
        """Write ``label_database.yaml`` (id -> color, name, validation flag).

        Returns:
            the label database dict
        """
        label_database = dict()
        for class_name, class_id in self.class_map.items():
            # "stairs" is only an alias of class 12; keep the first name
            # ("clutter") instead of letting the alias overwrite it
            label_database.setdefault(
                class_id,
                {
                    "color": self.color_map[class_id],
                    "name": class_name,
                    "validation": True,
                },
            )

        self._save_yaml(self.save_dir / "label_database.yaml", label_database)
        return label_database

    def _buf_count_newlines_gen(self, fname):
        """Count the newline bytes in ``fname``, reading it in 64 KiB chunks."""

        def _make_gen(reader):
            while True:
                b = reader(2**16)
                if not b:
                    break
                yield b

        with open(fname, "rb") as f:
            count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
        return count

    def process_file(self, filepath, mode):
        """process_file.

        Builds the scene from the per-instance ``Annotations/*.txt`` files,
        stores it as ``.npy`` and writes the instance ground truth file.

        Args:
            filepath: path to the scene folder
            mode: area the scene belongs to

        Returns:
            filebase: info about file
        """
        scene_name = os.path.basename(filepath)
        filebase = {
            "filepath": filepath,
            "scene": scene_name,
            "area": mode,
            "raw_filepath": str(filepath),
            "file_len": -1,
        }

        annotations_dir = self.data_dir / mode / scene_name / "Annotations"
        with os.scandir(annotations_dir) as entries:
            annotation_files = [f for f in entries if f.name.endswith(".txt")]

        scene_points = []
        # the running index of the annotation file is used as instance id
        for instance_counter, instance in enumerate(annotation_files):
            # the class is the part of the file name before the first "_"
            instance_class = self.class_map[instance.name.split("_")[0]]
            instance_points = np.loadtxt(instance.path)
            num_points = instance_points.shape[0]

            scene_points.append(
                np.hstack(
                    (
                        instance_points,
                        np.ones((num_points, 3)),  # dummy normals
                        np.full((num_points, 1), instance_class),
                        np.full((num_points, 1), instance_counter),
                    )
                )
            )

        points = np.vstack(scene_points)

        pcd_size = self._buf_count_newlines_gen(f"{filepath}/{scene_name}.txt")
        if points.shape[0] != pcd_size:
            logger.warning(
                f"FILE SIZE DOES NOT MATCH FOR {filepath}/{scene_name}.txt"
            )
            logger.warning(f"({points.shape[0]} vs. {pcd_size})")

        filebase["raw_segmentation_filepath"] = ""

        # add segment id as additional feature (DUMMY)
        points = np.hstack((points, np.ones(points.shape[0])[..., None]))
        # rotate the last three columns: segment -> 9, class -> 10,
        # instance -> last
        points[:, [9, 10, -1]] = points[:, [-1, 9, 10]]

        # encoded as (class + 1) * 1000 + instance + 1
        gt_data = (points[:, -2] + 1) * 1000 + points[:, -1] + 1

        filebase["file_len"] = len(points)

        processed_filepath = self.save_dir / mode / f"{scene_name}.npy"
        processed_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(processed_filepath, points.astype(np.float32))
        filebase["filepath"] = str(processed_filepath)

        processed_gt_filepath = (
            self.save_dir / "instance_gt" / mode / f"{scene_name}.txt"
        )
        processed_gt_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.savetxt(processed_gt_filepath, gt_data.astype(np.int32), fmt="%d")
        filebase["instance_gt_filepath"] = str(processed_gt_filepath)

        colors = points[:, 3:6] / 255
        filebase["color_mean"] = [float(c) for c in colors.mean(axis=0)]
        # despite the key name this is the mean of the squared colors; the
        # real std is derived from it in compute_color_mean_std
        filebase["color_std"] = [float(c) for c in (colors**2).mean(axis=0)]
        return filebase

    @staticmethod
    def _color_stats(samples):
        """Combine per-scene color moments into one mean/std dict.

        Scenes are averaged unweighted; std is sqrt(E[x^2] - E[x]^2).
        """
        color_mean = np.array([s["color_mean"] for s in samples]).mean(axis=0)
        color_sq = np.array([s["color_std"] for s in samples]).mean(axis=0)
        color_std = np.sqrt(color_sq - color_mean**2)
        return {
            "mean": [float(each) for each in color_mean],
            "std": [float(each) for each in color_std],
        }

    def compute_color_mean_std(self, train_database_path: str = ""):
        """Write per-area and leave-one-area-out color statistics.

        Args:
            train_database_path: unused, kept for interface compatibility
        """
        # Only the area databases are inputs. Matching every "Area_*.yaml"
        # would also pick up the statistics files written below and break a
        # second run.
        with os.scandir(self.save_dir) as entries:
            area_database_paths = [
                f
                for f in entries
                if f.name.startswith("Area_")
                and f.name.endswith("_database.yaml")
            ]
        databases = {
            f.name: (self._load_yaml(f.path) or [])
            for f in area_database_paths
        }

        # statistics of each single area
        for name, database in databases.items():
            if not database:
                logger.warning(f"No samples in {name}, skipping color stats")
                continue
            self._save_yaml(
                self.save_dir / f"{name}_color_mean_std.yaml",
                self._color_stats(database),
            )

        # statistics over all areas except the held-out one
        for name in databases:
            samples = [
                sample
                for other_name, database in databases.items()
                if other_name != name
                for sample in database
            ]
            if not samples:
                logger.warning(f"No other areas for {name}, skipping")
                continue
            file_path = name.replace("_database.yaml", "")
            self._save_yaml(
                self.save_dir / f"{file_path}_color_mean_std.yaml",
                self._color_stats(samples),
            )

    @logger.catch
    def fix_bugs_in_labels(self):
        """No label fixes are applied for S3DIS."""
        pass

    def joint_database(
        self,
        train_modes=(
            "Area_1",
            "Area_2",
            "Area_3",
            "Area_4",
            "Area_5",
            "Area_6",
        ),
    ):
        """Write ``train_<area>_database.yaml`` holding all *other* areas.

        Args:
            train_modes: areas taking part in the leave-one-out split
        """
        for mode in train_modes:
            joint_db = []
            for let_out in train_modes:
                if mode == let_out:
                    continue
                joint_db.extend(
                    self._load_yaml(
                        self.save_dir / (let_out + "_database.yaml")
                    )
                )
            self._save_yaml(
                self.save_dir / f"train_{mode}_database.yaml", joint_db
            )

    def _parse_scene_subscene(self, name):
        """Parse ``sceneXXXX_YY`` into ``(scene, sub_scene)`` integers."""
        scene_match = re.match(r"scene(\d{4})_(\d{2})", name)
        return int(scene_match.group(1)), int(scene_match.group(2))
class ScannetPreprocessing(BasePreprocessing):
    """Preprocessing of ScanNet and, with ``scannet200=True``, ScanNet200."""

    def __init__(
        self,
        data_dir: str = "./data/raw/scannet/scannet",
        save_dir: str = "./data/processed/scannet",
        modes: tuple = ("train", "validation", "test"),
        n_jobs: int = -1,
        git_repo: str = "./data/raw/scannet/ScanNet",
        scannet200: bool = False,
    ):
        """Build the label database and the list of scans for every mode.

        Args:
            data_dir: folder with ``scans``, ``scans_test`` and the label tsv
            save_dir: output folder
            modes: splits to process
            n_jobs: number of parallel workers, -1 means one per CPU core
            git_repo: checkout of the ScanNet repository (split files etc.)
            scannet200: use the ScanNet200 label set
        """
        super().__init__(data_dir, save_dir, modes, n_jobs)

        self.scannet200 = scannet200

        if self.scannet200:
            self.labels_pd = pd.read_csv(
                self.data_dir / "scannetv2-labels.combined.tsv",
                sep="\t",
                header=0,
            )

        git_repo = Path(git_repo)
        self.create_label_database(git_repo)
        trainval_split_dir = git_repo / "Tasks" / "Benchmark"
        for mode in self.modes:
            scannet_special_mode = "val" if mode == "validation" else mode
            with open(
                trainval_split_dir / (f"scannetv2_{scannet_special_mode}.txt")
            ) as f:
                # ignore blank lines instead of blindly dropping the last
                # entry, so files without trailing newline lose no scene
                split_file = [line for line in f.read().splitlines() if line]

            scans_folder = "scans_test" if mode == "test" else "scans"
            filepaths = [
                self.data_dir / scans_folder / scene / (scene + "_vh_clean_2.ply")
                for scene in split_file
            ]
            self.files[mode] = natsorted(filepaths)

    def create_label_database(self, git_repo):
        """Create (or load the cached) ``label_database.yaml``.

        Args:
            git_repo: path of the ScanNet repository checkout

        Returns:
            dict mapping class id to name, color and validation flag
        """
        if self.scannet200:
            label_database = {}
            for row_id, class_id in enumerate(VALID_CLASS_IDS_200):
                label_database[class_id] = {
                    "color": SCANNET_COLOR_MAP_200[class_id],
                    "name": CLASS_LABELS_200[row_id],
                    "validation": True,
                }
            self._save_yaml(
                self.save_dir / "label_database.yaml", label_database
            )
            return label_database

        if (self.save_dir / "label_database.yaml").exists():
            return self._load_yaml(self.save_dir / "label_database.yaml")

        df = pd.read_csv(
            self.data_dir / "scannetv2-labels.combined.tsv", sep="\t"
        )
        df = (
            df[~df[["nyu40class", "nyu40id"]].duplicated()][
                ["nyu40class", "nyu40id"]
            ]
            .set_index("nyu40id")
            .sort_index()[["nyu40class"]]
            .rename(columns={"nyu40class": "name"})
            .replace(" ", "_", regex=True)
        )
        # DataFrame.append was removed in pandas 2.0; concat keeps the
        # "empty" row first (index 0) followed by the nyu40 ids
        df = pd.concat([pd.DataFrame([{"name": "empty"}]), df])
        df["validation"] = False

        with open(
            git_repo / "Tasks" / "Benchmark" / "classes_SemVoxLabel-nyu40id.txt"
        ) as f:
            for_validation = f.read().split("\n")
        for category in for_validation:
            if not category.strip():
                continue  # tolerate blank / trailing lines
            index = int(re.split(" +", category)[0])
            df.loc[index, "validation"] = True

        # doing this hack because otherwise I will have to install imageio
        # SECURITY: this used to eval() text read from a file. The palette is
        # a plain list literal, so literal_eval parses it without executing
        # arbitrary code.
        import ast

        with open(git_repo / "BenchmarkScripts" / "util.py") as f:
            util = f.read()
        color_list = ast.literal_eval("[" + util.split("return [\n")[1])

        df["color"] = color_list

        label_database = df.to_dict("index")
        self._save_yaml(self.save_dir / "label_database.yaml", label_database)
        return label_database

    def process_file(self, filepath, mode):
        """process_file.

        Please note, that for obtaining segmentation labels ply files were used.

        Args:
            filepath: path to the main ply file
            mode: train, test or validation

        Returns:
            filebase: info about file
        """
        scene, sub_scene = self._parse_scene_subscene(filepath.name)
        filebase = {
            "filepath": filepath,
            "scene": scene,
            "sub_scene": sub_scene,
            "raw_filepath": str(filepath),
            "file_len": -1,
        }
        # reading both files and checking that they are fitting
        coords, features, _ = load_ply_with_normals(filepath)
        filebase["file_len"] = len(coords)
        points = np.hstack((coords, features))

        if mode in ["train", "validation"]:
            # getting scene information
            description_filepath = Path(
                filepath
            ).parent / filepath.name.replace("_vh_clean_2.ply", ".txt")
            with open(description_filepath) as f:
                description = [
                    line for line in f.read().splitlines() if line.strip()
                ]
            # the scene type is the value of the last "key = value" line
            filebase["scene_type"] = description[-1].split(" = ")[1]
            filebase["raw_description_filepath"] = description_filepath

            # getting instance info
            instance_info_filepath = next(
                Path(filepath).parent.glob("*.aggregation.json")
            )
            segment_indexes_filepath = next(
                Path(filepath).parent.glob("*[0-9].segs.json")
            )
            instance_db = self._read_json(instance_info_filepath)
            segments = self._read_json(segment_indexes_filepath)
            segments = np.array(segments["segIndices"])
            filebase["raw_instance_filepath"] = instance_info_filepath
            filebase["raw_segmentation_filepath"] = segment_indexes_filepath

            # add segment id as additional feature
            segment_ids = np.unique(segments, return_inverse=True)[1]
            points = np.hstack((points, segment_ids[..., None]))

            # reading labels file
            label_filepath = filepath.parent / filepath.name.replace(
                ".ply", ".labels.ply"
            )
            filebase["raw_label_filepath"] = label_filepath
            label_coords, label_colors, labels = load_ply_with_normals(
                label_filepath
            )
            if not np.allclose(coords, label_coords):
                raise ValueError("files doesn't have same coordinates")

            # adding instance label (-1 == point belongs to no instance)
            labels = labels[:, np.newaxis]
            empty_instance_label = np.full(labels.shape, -1)
            labels = np.hstack((labels, empty_instance_label))
            for instance in instance_db["segGroups"]:
                segments_occupied = np.array(instance["segments"])
                occupied_indices = np.isin(segments, segments_occupied)
                labels[occupied_indices, 1] = instance["id"]

                if self.scannet200:
                    label200 = instance["label"]
                    # Map the category name to id
                    label_ids = self.labels_pd[
                        self.labels_pd["raw_category"] == label200
                    ]["id"]
                    label_id = (
                        int(label_ids.iloc[0]) if len(label_ids) > 0 else 0
                    )
                    labels[occupied_indices, 0] = label_id
            points = np.hstack((points, labels))

            # encoded as semantic * 1000 + instance + 1
            gt_data = points[:, -2] * 1000 + points[:, -1] + 1
        else:
            # NOTE(review): the test segments are read from a hard-coded
            # path relative to the working directory.
            segments_test = "../../data/raw/scannet_test_segments"
            segment_indexes_filepath = filepath.name.replace(
                ".ply", ".0.010000.segs.json"
            )
            segments = self._read_json(
                f"{segments_test}/{segment_indexes_filepath}"
            )
            segments = np.array(segments["segIndices"])
            # add segment id as additional feature
            segment_ids = np.unique(segments, return_inverse=True)[1]
            points = np.hstack((points, segment_ids[..., None]))

        processed_filepath = (
            self.save_dir / mode / f"{scene:04}_{sub_scene:02}.npy"
        )
        processed_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(processed_filepath, points.astype(np.float32))
        filebase["filepath"] = str(processed_filepath)

        if mode == "test":
            # no ground truth and no color statistics for the test split
            return filebase

        processed_gt_filepath = (
            self.save_dir
            / "instance_gt"
            / mode
            / f"scene{scene:04}_{sub_scene:02}.txt"
        )
        processed_gt_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.savetxt(processed_gt_filepath, gt_data.astype(np.int32), fmt="%d")
        filebase["instance_gt_filepath"] = str(processed_gt_filepath)

        filebase["color_mean"] = [
            float((features[:, 0] / 255).mean()),
            float((features[:, 1] / 255).mean()),
            float((features[:, 2] / 255).mean()),
        ]
        # mean of squared colors; turned into a std in compute_color_mean_std
        filebase["color_std"] = [
            float(((features[:, 0] / 255) ** 2).mean()),
            float(((features[:, 1] / 255) ** 2).mean()),
            float(((features[:, 2] / 255) ** 2).mean()),
        ]
        return filebase

    def compute_color_mean_std(
        self,
        train_database_path: str = "./data/processed/scannet/train_database.yaml",
    ):
        """Aggregate per-scene color moments into ``color_mean_std.yaml``.

        Args:
            train_database_path: database whose samples are aggregated
        """
        train_database = self._load_yaml(train_database_path)
        color_mean, color_std = [], []
        for sample in train_database:
            color_std.append(sample["color_std"])
            color_mean.append(sample["color_mean"])

        color_mean = np.array(color_mean).mean(axis=0)
        # std = sqrt(E[x^2] - E[x]^2), scenes are averaged unweighted
        color_std = np.sqrt(np.array(color_std).mean(axis=0) - color_mean**2)
        feats_mean_std = {
            "mean": [float(each) for each in color_mean],
            "std": [float(each) for each in color_std],
        }
        self._save_yaml(self.save_dir / "color_mean_std.yaml", feats_mean_std)

    @logger.catch
    def fix_bugs_in_labels(self):
        """Drop points carrying known wrong labels from three train scenes.

        Only applied for the original ScanNet label set.
        """
        if self.scannet200:
            return

        logger.add(self.save_dir / "fixed_bugs_in_labels.log")
        # (scene, sub_scene) -> value of the last column to remove
        found_wrong_labels = {
            (270, 0): 50,
            (270, 2): 50,
            (384, 0): 149,
        }
        for (scene, sub_scene), wrong_label in found_wrong_labels.items():
            bug_file = (
                self.save_dir / "train" / f"{scene:04}_{sub_scene:02}.npy"
            )
            points = np.load(bug_file)
            bug_mask = points[:, -1] != wrong_label
            points = points[bug_mask]
            np.save(bug_file, points)
            logger.info(f"Fixed {bug_file}")

    def _parse_scene_subscene(self, name):
        """Parse ``sceneXXXX_YY...`` into ``(scene, sub_scene)`` integers."""
        scene_match = re.match(r"scene(\d{4})_(\d{2})", name)
        return int(scene_match.group(1)), int(scene_match.group(2))
class SemanticKittiPreprocessing(BasePreprocessing):
    """Preprocessing of SemanticKITTI velodyne scans and their labels."""

    def __init__(
        self,
        data_dir: str = "./data/raw/semantic_kitti",
        save_dir: str = "./data/processed/semantic_kitti",
        modes: tuple = ("train", "validation", "test"),
        n_jobs: int = -1,
        git_repo: str = "./data/raw/semantic-kitti-api",
    ):
        """Load the dataset config and collect scans and poses per mode.

        Args:
            data_dir: folder containing ``*/<scene>/velodyne/*bin``
            save_dir: output folder
            modes: splits to process
            n_jobs: number of parallel workers, -1 means one per CPU core
            git_repo: checkout of semantic-kitti-api (provides the config)

        Raises:
            FileNotFoundError: if a scene of the split has no scans
        """
        super().__init__(data_dir, save_dir, modes, n_jobs)

        git_repo = Path(git_repo)
        config_file = git_repo / "config" / "semantic-kitti.yaml"
        self.create_label_database(config_file)
        self.config = self._load_yaml(config_file)
        self.pose = dict()

        for mode in self.modes:
            scene_mode = "valid" if mode == "validation" else mode
            self.pose[mode] = dict()
            for scene in sorted(self.config["split"][scene_mode]):
                filepaths = [
                    str(file)
                    for file in self.data_dir.glob(
                        f"*/{scene:02}/velodyne/*bin"
                    )
                ]
                if not filepaths:
                    # previously surfaced as a bare IndexError
                    raise FileNotFoundError(
                        f"no velodyne scans for scene {scene:02} "
                        f"in {self.data_dir}"
                    )
                self.files[mode].extend(natsorted(filepaths))

                # calib.txt and poses.txt live next to the velodyne folder
                scene_dir = Path(filepaths[0]).parent.parent
                calibration = parse_calibration(scene_dir / "calib.txt")
                self.pose[mode][scene] = parse_poses(
                    scene_dir / "poses.txt", calibration
                )

    def create_label_database(self, config_file):
        """Create (or load the cached) ``label_database.yaml``.

        Args:
            config_file: path of ``semantic-kitti.yaml``

        Returns:
            dict mapping the learning id to name, color and validation flag
        """
        if (self.save_dir / "label_database.yaml").exists():
            return self._load_yaml(self.save_dir / "label_database.yaml")
        config = self._load_yaml(config_file)
        label_database = {}
        for key, old_key in config["learning_map_inv"].items():
            label_database[key] = {
                "name": config["labels"][old_key],
                # bgr -> rgb
                "color": config["color_map"][old_key][::-1],
                "validation": not config["learning_ignore"][key],
            }

        self._save_yaml(self.save_dir / "label_database.yaml", label_database)
        return label_database

    def process_file(self, filepath, mode):
        """process_file.

        Args:
            filepath: path to the velodyne ``.bin`` scan (string)
            mode: train, validation or test

        Returns:
            filebase: info about file

        Raises:
            ValueError: if scan and label file differ in length
        """
        # Scene and frame come from the path components
        # (<scene>/velodyne/<frame>.bin). Searching the whole path string
        # with a regex picks up digits from parent directories, so it is
        # only a fallback.
        path = Path(filepath)
        scene, sub_scene = path.parent.parent.name, path.stem
        if not (scene.isdigit() and sub_scene.isdigit()):
            scene, sub_scene = re.search(
                r"(\d{2}).*(\d{6})", filepath
            ).group(1, 2)

        filebase = {
            "filepath": filepath,
            "scene": int(scene),
            "sub_scene": int(sub_scene),
            "file_len": -1,
            "pose": self.pose[mode][int(scene)][int(sub_scene)].tolist(),
        }

        # each scan is a flat float32 buffer with four values per point
        points = np.fromfile(filepath, dtype=np.float32).reshape(-1, 4)
        filebase["file_len"] = len(points)

        if mode in ["train", "validation"]:
            # getting label info
            label_filepath = filepath.replace("velodyne", "labels").replace(
                "bin", "label"
            )
            filebase["label_filepath"] = label_filepath
            label = np.fromfile(label_filepath, dtype=np.uint32).astype(
                np.int32
            )
            if points.shape[0] != label.shape[0]:
                raise ValueError("Files do not have same length")
            # lower 16 bits: semantic label, upper 16 bits: instance id
            semantic_label = label & 0xFFFF
            instance_label = label >> 16

            # remap raw ids; masks use the untouched copy so an already
            # remapped value is never remapped again
            raw_semantic = semantic_label.copy()
            learning_map = self.config["learning_map"]
            for raw_id in np.unique(raw_semantic):
                semantic_label[raw_semantic == raw_id] = learning_map[raw_id]

            points = np.hstack(
                (
                    points,
                    semantic_label[:, np.newaxis],
                    instance_label[:, np.newaxis],
                )
            )

        processed_filepath = self.save_dir / mode / f"{scene}_{sub_scene}.npy"
        processed_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(processed_filepath, points.astype(np.float32))
        filebase["filepath"] = str(processed_filepath)

        return filebase


def _row_major_pose(values):
    """Build a 4x4 homogeneous matrix from the 12 values of a 3x4 matrix."""
    pose = np.zeros((4, 4))
    pose[0, 0:4] = values[0:4]
    pose[1, 0:4] = values[4:8]
    pose[2, 0:4] = values[8:12]
    pose[3, 3] = 1.0
    return pose


def parse_calibration(filename):
    """read calibration file with given filename

    Every non-blank line has the form ``<key>: <12 floats>``.

    Returns
    -------
    dict
        Calibration matrices as 4x4 numpy arrays.
    """
    calib = {}

    with open(filename) as calib_file:
        for line in calib_file:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            key, content = line.strip().split(":")
            values = [float(v) for v in content.strip().split()]
            calib[key] = _row_major_pose(values)
    return calib


def parse_poses(filename, calibration):
    """read poses file with per-scan poses from given filename

    Each pose is conjugated with the ``"Tr"`` calibration matrix,
    i.e. ``inv(Tr) @ pose @ Tr``.

    Returns
    -------
    list
        list of poses as 4x4 numpy arrays.
    """
    poses = []

    Tr = calibration["Tr"]
    Tr_inv = np.linalg.inv(Tr)

    with open(filename) as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            values = [float(v) for v in line.strip().split()]
            pose = _row_major_pose(values)
            poses.append(np.matmul(Tr_inv, np.matmul(pose, Tr)))

    return poses


class STPLS3DPreprocessing(BasePreprocessing):
    """Preprocessing of the STPLS3D instance segmentation challenge data."""

    def __init__(
        self,
        data_dir: str = "../../data/raw/stpls3d",
        save_dir: str = "../../data/processed/stpls3d",
        modes: tuple = ("train", "validation", "test"),
        n_jobs: int = -1,
    ):
        """Set up the label tables and list the files of every mode.

        Args:
            data_dir: folder with one sub-folder per mode
            save_dir: output folder
            modes: splits to process
            n_jobs: number of parallel workers, -1 means one per CPU core
        """
        super().__init__(data_dir, save_dir, modes, n_jobs)

        # class names follow
        # https://github.com/meidachen/STPLS3D/blob/main/HAIS/STPLS3DInstanceSegmentationChallenge_Codalab_Evaluate.py#L31
        self.class_map = {
            "Ground": 0,
            "Build": 1,
            "LowVeg": 2,
            "MediumVeg": 3,
            "HighVeg": 4,
            "Vehicle": 5,
            "Truck": 6,
            "Aircraft": 7,
            "MilitaryVeh": 8,
            "Bike": 9,
            "Motorcycle": 10,
            "LightPole": 11,
            "StreetSign": 12,
            "Clutter": 13,
            "Fence": 14,
        }

        self.color_map = [
            [0, 255, 0],  # Ground
            [0, 0, 255],  # Build
            [0, 255, 255],  # LowVeg
            [255, 255, 0],  # MediumVeg
            [255, 0, 255],  # HighVeg
            [100, 100, 255],  # Vehicle
            [200, 200, 100],  # Truck
            [170, 120, 200],  # Aircraft
            [255, 0, 0],  # MilitaryVeh
            [200, 100, 100],  # Bike
            [10, 200, 100],  # Motorcycle
            [200, 200, 200],  # LightPole
            [50, 50, 50],  # StreetSign
            [60, 130, 60],  # Clutter
            [130, 30, 60],  # Fence
        ]

        self.create_label_database()

        for mode in self.modes:
            with os.scandir(self.data_dir / mode) as entries:
                filepaths = [f.path for f in entries]
            self.files[mode] = natsorted(filepaths)

    def create_label_database(self):
        """Write ``label_database.yaml`` (id -> color, name, validation flag).

        Returns:
            the label database dict
        """
        label_database = dict()
        for class_name, class_id in self.class_map.items():
            label_database[class_id] = {
                "color": self.color_map[class_id],
                "name": class_name,
                "validation": True,
            }

        self._save_yaml(self.save_dir / "label_database.yaml", label_database)
        return label_database

    def process_file(self, filepath, mode):
        """process_file.

        Stores the full scene as ``.npy``. Validation and test scenes are
        additionally cut into square blocks that are stored one by one.

        Args:
            filepath: path to the header-less csv file of the scene
            mode: train, test or validation

        Returns:
            filebase: info about file

        Raises:
            ValueError: if a block has too few points or too many instances
        """
        scene_name = os.path.basename(filepath)
        scene_stem = scene_name.replace(".txt", "")
        filebase = {
            "filepath": filepath,
            "scene": scene_name,
            "raw_filepath": str(filepath),
            "file_len": -1,
        }

        points = pd.read_csv(filepath, header=None).values
        num_points = points.shape[0]

        filebase["raw_segmentation_filepath"] = ""

        # NOTE(review): the reorder below assumes 8 input columns with
        # labels for train/validation and 6 without labels for test —
        # confirm against the raw data.
        if mode in ["train", "validation"]:
            # dummy columns: normal 1-3, segments
            dummies = np.ones((num_points, 4))
        else:
            # dummy columns: semantic class, instance id, normal 1-3,
            # segments
            dummies = np.ones((num_points, 6))
        points = np.hstack((points, dummies))

        # move columns 6 and 7 (the labels) to the end
        points = points[:, [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 6, 7]]

        # move point clouds to be in positive range (important for split pointcloud function)
        points[:, :3] = points[:, :3] - points[:, :3].min(0)

        points = points.astype(np.float32)

        if mode == "test":
            points = points[:, :-2]  # drop the dummy label columns again
        else:
            # -1 indicates "no instance"
            points[points[:, -1] == -100.0, -1] = -1

        filebase["file_len"] = len(points)

        processed_filepath = self.save_dir / mode / f"{scene_stem}.npy"
        processed_filepath.parent.mkdir(parents=True, exist_ok=True)
        np.save(processed_filepath, points.astype(np.float32))
        filebase["filepath"] = str(processed_filepath)

        if mode in ["validation", "test"]:
            min_block_points = 10000
            blocks = self.splitPointCloud(points)

            filebase["instance_gt_filepath"] = []
            filebase["filepath_crop"] = []
            for block_id, block in enumerate(blocks):
                if len(block) <= min_block_points:
                    # explicit raise: an assert would vanish under -O
                    raise ValueError(
                        f"block {block_id} of {scene_name} has only "
                        f"{len(block)} points (more than "
                        f"{min_block_points} are required)"
                    )

                if mode == "validation":
                    # re-number the instance ids of the block from 0;
                    # when -1 ("no instance") is present it becomes 0
                    new_instance_ids = np.unique(
                        block[:, -1], return_inverse=True
                    )[1]
                    if new_instance_ids.max() >= 1000:
                        raise ValueError(
                            "we cannot encode when there are more than "
                            "999 instances in a block"
                        )

                    gt_data = (block[:, -2]) * 1000 + new_instance_ids

                    processed_gt_filepath = (
                        self.save_dir
                        / "instance_gt"
                        / mode
                        / f"{scene_stem}_{block_id}.txt"
                    )
                    processed_gt_filepath.parent.mkdir(
                        parents=True, exist_ok=True
                    )
                    np.savetxt(
                        processed_gt_filepath,
                        gt_data.astype(np.int32),
                        fmt="%d",
                    )
                    filebase["instance_gt_filepath"].append(
                        str(processed_gt_filepath)
                    )

                processed_filepath = (
                    self.save_dir / mode / f"{scene_stem}_{block_id}.npy"
                )
                processed_filepath.parent.mkdir(parents=True, exist_ok=True)
                np.save(processed_filepath, block.astype(np.float32))
                filebase["filepath_crop"].append(str(processed_filepath))

        filebase["color_mean"] = [
            float((points[:, 3] / 255).mean()),
            float((points[:, 4] / 255).mean()),
            float((points[:, 5] / 255).mean()),
        ]
        # mean of squared colors; turned into a std in compute_color_mean_std
        filebase["color_std"] = [
            float(((points[:, 3] / 255) ** 2).mean()),
            float(((points[:, 4] / 255) ** 2).mean()),
            float(((points[:, 5] / 255) ** 2).mean()),
        ]
        return filebase

    def compute_color_mean_std(
        self,
        train_database_path: str = "./data/processed/stpls3d/train_database.yaml",
    ):
        """Aggregate per-scene color moments into ``color_mean_std.yaml``.

        Args:
            train_database_path: database whose samples are aggregated
        """
        train_database = self._load_yaml(train_database_path)
        color_mean, color_std = [], []
        for sample in train_database:
            color_std.append(sample["color_std"])
            color_mean.append(sample["color_mean"])

        color_mean = np.array(color_mean).mean(axis=0)
        # std = sqrt(E[x^2] - E[x]^2), scenes are averaged unweighted
        color_std = np.sqrt(np.array(color_std).mean(axis=0) - color_mean**2)
        feats_mean_std = {
            "mean": [float(each) for each in color_mean],
            "std": [float(each) for each in color_std],
        }
        self._save_yaml(self.save_dir / "color_mean_std.yaml", feats_mean_std)

    def splitPointCloud(self, cloud, size=50.0, stride=50):
        """Cut ``cloud`` into square x/y cells laid out on a regular grid.

        Cell bounds are inclusive on both sides, so a point lying exactly on
        a shared border ends up in both neighbouring blocks.

        Args:
            cloud: array whose first two columns are x and y (expected >= 0)
            size: edge length of a cell
            stride: distance between the origins of neighbouring cells

        Returns:
            list of arrays, one per cell (possibly empty)
        """
        limitMax = np.amax(cloud[:, 0:3], axis=0)
        # at least one cell per axis, otherwise clouds much smaller than
        # ``size`` would produce no block at all
        width = max(int(np.ceil((limitMax[0] - size) / stride)) + 1, 1)
        depth = max(int(np.ceil((limitMax[1] - size) / stride)) + 1, 1)
        cells = [
            (x * stride, y * stride)
            for x in range(width)
            for y in range(depth)
        ]
        blocks = []
        for (x, y) in cells:
            xcond = (cloud[:, 0] <= x + size) & (cloud[:, 0] >= x)
            ycond = (cloud[:, 1] <= y + size) & (cloud[:, 1] >= y)
            blocks.append(cloud[xcond & ycond, :])
        return blocks

    @logger.catch
    def fix_bugs_in_labels(self):
        """No label fixes are applied for STPLS3D."""
        pass

    def _parse_scene_subscene(self, name):
        """Parse ``sceneXXXX_YY`` into ``(scene, sub_scene)`` integers."""
        scene_match = re.match(r"scene(\d{4})_(\d{2})", name)
        return int(scene_match.group(1)), int(scene_match.group(2))
def check_aspect(crop_range, aspect_min):
    """Tell whether any pairwise aspect ratio of a crop is large enough.

    Args:
        crop_range: numpy array with the three edge lengths (x, y, z)
        aspect_min: smallest accepted min/max ratio

    Returns:
        True if the xy, xz or yz ratio is at least ``aspect_min``
    """
    xy_aspect = np.min(crop_range[:2]) / np.max(crop_range[:2])
    xz_aspect = np.min(crop_range[[0, 2]]) / np.max(crop_range[[0, 2]])
    yz_aspect = np.min(crop_range[1:]) / np.max(crop_range[1:])
    return (
        (xy_aspect >= aspect_min)
        or (xz_aspect >= aspect_min)
        or (yz_aspect >= aspect_min)
    )


class RandomCuboid(object):
    """
    RandomCuboid augmentation from DepthContrast [https://arxiv.org/abs/2101.02691]

    This variant crops a random axis-aligned square window in the x/y plane
    (z is never restricted) and returns a boolean mask of the points inside.
    A crop is only accepted when it keeps at least ``min_points`` points.
    """

    def __init__(
        self,
        min_points,
        crop_length=6.0,
        version1=True,
    ):
        """
        Args:
            min_points: minimum number of points a crop has to contain
            crop_length: edge length of the square crop window
            version1: if True the crop center is jittered by up to a quarter
                of the scene extent; otherwise by up to half the extent
                minus a quarter of ``crop_length``
        """
        self.crop_length = crop_length
        self.min_points = min_points
        self.version1 = version1

    def __call__(self, point_cloud):
        """Sample a crop and return the mask of the points it keeps.

        Args:
            point_cloud: array of shape (N, >=2); columns 0 and 1 are x, y

        Returns:
            boolean array of length N. All True when the cloud has fewer
            than ``min_points`` points or no valid crop was found in 100
            attempts.
        """
        num_points = point_cloud.shape[0]
        # ``np.bool`` was a deprecated alias removed in NumPy 1.24; the
        # builtin ``bool`` yields the same dtype on every NumPy version
        if num_points < self.min_points:
            print("too small pcd")
            return np.ones(num_points, dtype=bool)

        xy = point_cloud[:, :2]
        xy_min = xy.min(axis=0)
        range_xy = xy.max(axis=0) - xy_min

        for _ in range(100):
            # start at the middle of the x/y bounding box
            sample_center = xy_min + range_xy / 2

            if self.version1:
                offset_x = np.random.uniform(
                    -range_xy[0] / 4, range_xy[0] / 4
                )
                offset_y = np.random.uniform(
                    -range_xy[1] / 4, range_xy[1] / 4
                )
            else:
                offset_x = np.random.uniform(
                    -(range_xy[0] / 2) + self.crop_length / 4,
                    +(range_xy[0] / 2) - self.crop_length / 4,
                )
                offset_y = np.random.uniform(
                    -(range_xy[1] / 2) + self.crop_length / 4,
                    +(range_xy[1] / 2) - self.crop_length / 4,
                )

            sample_center[0] = sample_center[0] + offset_x
            sample_center[1] = sample_center[1] + offset_y

            min_xy = sample_center - self.crop_length / 2
            max_xy = sample_center + self.crop_length / 2

            # inside the window on both axes
            new_pointidx = np.all(xy <= max_xy, axis=1) & np.all(
                xy >= min_xy, axis=1
            )

            if np.sum(new_pointidx) < self.min_points:
                print("TOO SMALL")
                continue

            return new_pointidx

        # fallback
        print("FALLBACK")
        return np.ones(num_points, dtype=bool)
+CLASS_LABELS_20 = ( + "wall", + "floor", + "cabinet", + "bed", + "chair", + "sofa", + "table", + "door", + "window", + "bookshelf", + "picture", + "counter", + "desk", + "curtain", + "refrigerator", + "shower curtain", + "toilet", + "sink", + "bathtub", + "otherfurniture", +) + +SCANNET_COLOR_MAP_20 = { + 0: (0.0, 0.0, 0.0), + 1: (174.0, 199.0, 232.0), + 2: (152.0, 223.0, 138.0), + 3: (31.0, 119.0, 180.0), + 4: (255.0, 187.0, 120.0), + 5: (188.0, 189.0, 34.0), + 6: (140.0, 86.0, 75.0), + 7: (255.0, 152.0, 150.0), + 8: (214.0, 39.0, 40.0), + 9: (197.0, 176.0, 213.0), + 10: (148.0, 103.0, 189.0), + 11: (196.0, 156.0, 148.0), + 12: (23.0, 190.0, 207.0), + 14: (247.0, 182.0, 210.0), + 15: (66.0, 188.0, 102.0), + 16: (219.0, 219.0, 141.0), + 17: (140.0, 57.0, 197.0), + 18: (202.0, 185.0, 52.0), + 19: (51.0, 176.0, 203.0), + 20: (200.0, 54.0, 131.0), + 21: (92.0, 193.0, 61.0), + 22: (78.0, 71.0, 183.0), + 23: (172.0, 114.0, 82.0), + 24: (255.0, 127.0, 14.0), + 25: (91.0, 163.0, 138.0), + 26: (153.0, 98.0, 156.0), + 27: (140.0, 153.0, 101.0), + 28: (158.0, 218.0, 229.0), + 29: (100.0, 125.0, 154.0), + 30: (178.0, 127.0, 135.0), + 32: (146.0, 111.0, 194.0), + 33: (44.0, 160.0, 44.0), + 34: (112.0, 128.0, 144.0), + 35: (96.0, 207.0, 209.0), + 36: (227.0, 119.0, 194.0), + 37: (213.0, 92.0, 176.0), + 38: (94.0, 106.0, 211.0), + 39: (82.0, 84.0, 163.0), + 40: (100.0, 85.0, 144.0), +} + +### ScanNet200 Benchmark constants ### +VALID_CLASS_IDS_200 = ( + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, 
+ 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, + 121, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 221, + 229, + 230, + 232, + 233, + 242, + 250, + 261, + 264, + 276, + 283, + 286, + 300, + 304, + 312, + 323, + 325, + 331, + 342, + 356, + 370, + 392, + 395, + 399, + 408, + 417, + 488, + 540, + 562, + 570, + 572, + 581, + 609, + 748, + 776, + 1156, + 1163, + 1164, + 1165, + 1166, + 1167, + 1168, + 1169, + 1170, + 1171, + 1172, + 1173, + 1174, + 1175, + 1176, + 1178, + 1179, + 1180, + 1181, + 1182, + 1183, + 1184, + 1185, + 1186, + 1187, + 1188, + 1189, + 1190, + 1191, +) + +CLASS_LABELS_200 = ( + "wall", + "chair", + "floor", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + "basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", 
+ "light", + "laundry basket", + "pipe", + "clothes dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "bicycle", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "candle", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "guitar case", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "cd case", + "closet rod", + "coffee kettle", + "structure", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "storage organizer", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "luggage", + "mattress", +) + +SCANNET_COLOR_MAP_200 = { + 0: (0.0, 0.0, 0.0), + 1: (174.0, 199.0, 232.0), + 2: (188.0, 189.0, 34.0), + 3: (152.0, 223.0, 138.0), + 4: (255.0, 152.0, 150.0), + 5: (214.0, 39.0, 40.0), + 6: (91.0, 135.0, 
229.0), + 7: (31.0, 119.0, 180.0), + 8: (229.0, 91.0, 104.0), + 9: (247.0, 182.0, 210.0), + 10: (91.0, 229.0, 110.0), + 11: (255.0, 187.0, 120.0), + 13: (141.0, 91.0, 229.0), + 14: (112.0, 128.0, 144.0), + 15: (196.0, 156.0, 148.0), + 16: (197.0, 176.0, 213.0), + 17: (44.0, 160.0, 44.0), + 18: (148.0, 103.0, 189.0), + 19: (229.0, 91.0, 223.0), + 21: (219.0, 219.0, 141.0), + 22: (192.0, 229.0, 91.0), + 23: (88.0, 218.0, 137.0), + 24: (58.0, 98.0, 137.0), + 26: (177.0, 82.0, 239.0), + 27: (255.0, 127.0, 14.0), + 28: (237.0, 204.0, 37.0), + 29: (41.0, 206.0, 32.0), + 31: (62.0, 143.0, 148.0), + 32: (34.0, 14.0, 130.0), + 33: (143.0, 45.0, 115.0), + 34: (137.0, 63.0, 14.0), + 35: (23.0, 190.0, 207.0), + 36: (16.0, 212.0, 139.0), + 38: (90.0, 119.0, 201.0), + 39: (125.0, 30.0, 141.0), + 40: (150.0, 53.0, 56.0), + 41: (186.0, 197.0, 62.0), + 42: (227.0, 119.0, 194.0), + 44: (38.0, 100.0, 128.0), + 45: (120.0, 31.0, 243.0), + 46: (154.0, 59.0, 103.0), + 47: (169.0, 137.0, 78.0), + 48: (143.0, 245.0, 111.0), + 49: (37.0, 230.0, 205.0), + 50: (14.0, 16.0, 155.0), + 51: (196.0, 51.0, 182.0), + 52: (237.0, 80.0, 38.0), + 54: (138.0, 175.0, 62.0), + 55: (158.0, 218.0, 229.0), + 56: (38.0, 96.0, 167.0), + 57: (190.0, 77.0, 246.0), + 58: (208.0, 49.0, 84.0), + 59: (208.0, 193.0, 72.0), + 62: (55.0, 220.0, 57.0), + 63: (10.0, 125.0, 140.0), + 64: (76.0, 38.0, 202.0), + 65: (191.0, 28.0, 135.0), + 66: (211.0, 120.0, 42.0), + 67: (118.0, 174.0, 76.0), + 68: (17.0, 242.0, 171.0), + 69: (20.0, 65.0, 247.0), + 70: (208.0, 61.0, 222.0), + 71: (162.0, 62.0, 60.0), + 72: (210.0, 235.0, 62.0), + 73: (45.0, 152.0, 72.0), + 74: (35.0, 107.0, 149.0), + 75: (160.0, 89.0, 237.0), + 76: (227.0, 56.0, 125.0), + 77: (169.0, 143.0, 81.0), + 78: (42.0, 143.0, 20.0), + 79: (25.0, 160.0, 151.0), + 80: (82.0, 75.0, 227.0), + 82: (253.0, 59.0, 222.0), + 84: (240.0, 130.0, 89.0), + 86: (123.0, 172.0, 47.0), + 87: (71.0, 194.0, 133.0), + 88: (24.0, 94.0, 205.0), + 89: (134.0, 16.0, 179.0), + 90: (159.0, 
32.0, 52.0), + 93: (213.0, 208.0, 88.0), + 95: (64.0, 158.0, 70.0), + 96: (18.0, 163.0, 194.0), + 97: (65.0, 29.0, 153.0), + 98: (177.0, 10.0, 109.0), + 99: (152.0, 83.0, 7.0), + 100: (83.0, 175.0, 30.0), + 101: (18.0, 199.0, 153.0), + 102: (61.0, 81.0, 208.0), + 103: (213.0, 85.0, 216.0), + 104: (170.0, 53.0, 42.0), + 105: (161.0, 192.0, 38.0), + 106: (23.0, 241.0, 91.0), + 107: (12.0, 103.0, 170.0), + 110: (151.0, 41.0, 245.0), + 112: (133.0, 51.0, 80.0), + 115: (184.0, 162.0, 91.0), + 116: (50.0, 138.0, 38.0), + 118: (31.0, 237.0, 236.0), + 120: (39.0, 19.0, 208.0), + 121: (223.0, 27.0, 180.0), + 122: (254.0, 141.0, 85.0), + 125: (97.0, 144.0, 39.0), + 128: (106.0, 231.0, 176.0), + 130: (12.0, 61.0, 162.0), + 131: (124.0, 66.0, 140.0), + 132: (137.0, 66.0, 73.0), + 134: (250.0, 253.0, 26.0), + 136: (55.0, 191.0, 73.0), + 138: (60.0, 126.0, 146.0), + 139: (153.0, 108.0, 234.0), + 140: (184.0, 58.0, 125.0), + 141: (135.0, 84.0, 14.0), + 145: (139.0, 248.0, 91.0), + 148: (53.0, 200.0, 172.0), + 154: (63.0, 69.0, 134.0), + 155: (190.0, 75.0, 186.0), + 156: (127.0, 63.0, 52.0), + 157: (141.0, 182.0, 25.0), + 159: (56.0, 144.0, 89.0), + 161: (64.0, 160.0, 250.0), + 163: (182.0, 86.0, 245.0), + 165: (139.0, 18.0, 53.0), + 166: (134.0, 120.0, 54.0), + 168: (49.0, 165.0, 42.0), + 169: (51.0, 128.0, 133.0), + 170: (44.0, 21.0, 163.0), + 177: (232.0, 93.0, 193.0), + 180: (176.0, 102.0, 54.0), + 185: (116.0, 217.0, 17.0), + 188: (54.0, 209.0, 150.0), + 191: (60.0, 99.0, 204.0), + 193: (129.0, 43.0, 144.0), + 195: (252.0, 100.0, 106.0), + 202: (187.0, 196.0, 73.0), + 208: (13.0, 158.0, 40.0), + 213: (52.0, 122.0, 152.0), + 214: (128.0, 76.0, 202.0), + 221: (187.0, 50.0, 115.0), + 229: (180.0, 141.0, 71.0), + 230: (77.0, 208.0, 35.0), + 232: (72.0, 183.0, 168.0), + 233: (97.0, 99.0, 203.0), + 242: (172.0, 22.0, 158.0), + 250: (155.0, 64.0, 40.0), + 261: (118.0, 159.0, 30.0), + 264: (69.0, 252.0, 148.0), + 276: (45.0, 103.0, 173.0), + 283: (111.0, 38.0, 149.0), + 286: (184.0, 
9.0, 49.0), + 300: (188.0, 174.0, 67.0), + 304: (53.0, 206.0, 53.0), + 312: (97.0, 235.0, 252.0), + 323: (66.0, 32.0, 182.0), + 325: (236.0, 114.0, 195.0), + 331: (241.0, 154.0, 83.0), + 342: (133.0, 240.0, 52.0), + 356: (16.0, 205.0, 144.0), + 370: (75.0, 101.0, 198.0), + 392: (237.0, 95.0, 251.0), + 395: (191.0, 52.0, 49.0), + 399: (227.0, 254.0, 54.0), + 408: (49.0, 206.0, 87.0), + 417: (48.0, 113.0, 150.0), + 488: (125.0, 73.0, 182.0), + 540: (229.0, 32.0, 114.0), + 562: (158.0, 119.0, 28.0), + 570: (60.0, 205.0, 27.0), + 572: (18.0, 215.0, 201.0), + 581: (79.0, 76.0, 153.0), + 609: (134.0, 13.0, 116.0), + 748: (192.0, 97.0, 63.0), + 776: (108.0, 163.0, 18.0), + 1156: (95.0, 220.0, 156.0), + 1163: (98.0, 141.0, 208.0), + 1164: (144.0, 19.0, 193.0), + 1165: (166.0, 36.0, 57.0), + 1166: (212.0, 202.0, 34.0), + 1167: (23.0, 206.0, 34.0), + 1168: (91.0, 211.0, 236.0), + 1169: (79.0, 55.0, 137.0), + 1170: (182.0, 19.0, 117.0), + 1171: (134.0, 76.0, 14.0), + 1172: (87.0, 185.0, 28.0), + 1173: (82.0, 224.0, 187.0), + 1174: (92.0, 110.0, 214.0), + 1175: (168.0, 80.0, 171.0), + 1176: (197.0, 63.0, 51.0), + 1178: (175.0, 199.0, 77.0), + 1179: (62.0, 180.0, 98.0), + 1180: (8.0, 91.0, 150.0), + 1181: (77.0, 15.0, 130.0), + 1182: (154.0, 65.0, 96.0), + 1183: (197.0, 152.0, 11.0), + 1184: (59.0, 155.0, 45.0), + 1185: (12.0, 147.0, 145.0), + 1186: (54.0, 35.0, 219.0), + 1187: (210.0, 73.0, 181.0), + 1188: (221.0, 124.0, 77.0), + 1189: (149.0, 214.0, 66.0), + 1190: (72.0, 185.0, 134.0), + 1191: (42.0, 94.0, 198.0), +} + +### For instance segmentation the non-object categories ### +VALID_PANOPTIC_IDS = (1, 3) + +CLASS_LABELS_PANOPTIC = ("wall", "floor") diff --git a/models/Mask3D/build/lib/mask3d/datasets/scannet200/scannet200_splits.py b/models/Mask3D/build/lib/mask3d/datasets/scannet200/scannet200_splits.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5585f70319d1eb061669bd82bbf3d64d0bca7b --- /dev/null +++ 
b/models/Mask3D/build/lib/mask3d/datasets/scannet200/scannet200_splits.py @@ -0,0 +1,625 @@ +### This file contains the HEAD - COMMON - TAIL split category ids for ScanNet 200 + +HEAD_CATS_SCANNET_200 = [ + "tv stand", + "curtain", + "blinds", + "shower curtain", + "bookshelf", + "tv", + "kitchen cabinet", + "pillow", + "lamp", + "dresser", + "monitor", + "object", + "ceiling", + "board", + "stove", + "closet wall", + "couch", + "office chair", + "kitchen counter", + "shower", + "closet", + "doorframe", + "sofa chair", + "mailbox", + "nightstand", + "washing machine", + "picture", + "book", + "sink", + "recycling bin", + "table", + "backpack", + "shower wall", + "toilet", + "copier", + "counter", + "stool", + "refrigerator", + "window", + "file cabinet", + "chair", + "wall", + "plant", + "coffee table", + "stairs", + "armchair", + "cabinet", + "bathroom vanity", + "bathroom stall", + "mirror", + "blackboard", + "trash can", + "stair rail", + "box", + "towel", + "door", + "clothes", + "whiteboard", + "bed", + "floor", + "bathtub", + "desk", + "wardrobe", + "clothes dryer", + "radiator", + "shelf", +] +COMMON_CATS_SCANNET_200 = [ + "cushion", + "end table", + "dining table", + "keyboard", + "bag", + "toilet paper", + "printer", + "blanket", + "microwave", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "basket", + "fan", + "laptop", + "person", + "paper towel dispenser", + "oven", + "rack", + "piano", + "suitcase", + "rail", + "container", + "telephone", + "stand", + "light", + "laundry basket", + "pipe", + "seat", + "column", + "bicycle", + "ladder", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "machine", + "mat", + "windowsill", + "bulletin board", + "fireplace", + "mini fridge", + "water cooler", + "shower door", + "pillar", + "ledge", + "furniture", + "cart", + "decoration", + "closet door", + "vacuum cleaner", + "dish rack", + "range hood", + "projector screen", + "divider", + "bathroom counter", + "laundry 
hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "bathroom cabinet", + "structure", + "storage organizer", + "potted plant", + "mattress", +] +TAIL_CATS_SCANNET_200 = [ + "paper", + "plate", + "soap dispenser", + "bucket", + "clock", + "guitar", + "toilet paper holder", + "speaker", + "cup", + "paper towel roll", + "bar", + "toaster", + "ironing board", + "soap dish", + "toilet paper dispenser", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "paper cutter", + "tray", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "sign", + "projector", + "candle", + "plunger", + "stuffed animal", + "headphones", + "broom", + "guitar case", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "laundry detergent", + "dumbbell", + "tube", + "cd case", + "closet rod", + "coffee kettle", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "luggage", +] + + +### Given the different size of the official train and val sets, not all ScanNet200 categories are present in the validation set. 
+### Here we list of categories with labels and IDs present in both train and validation set, and the remaining categories those are present in train, but not in val +### We dont evaluate on unseen validation categories in this benchmark + +VALID_CLASS_IDS_200_VALIDATION = ( + "wall", + "chair", + "floor", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + "basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", + "light", + "laundry basket", + "pipe", + "clothes dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain 
rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "closet rod", + "coffee kettle", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "mattress", +) + +CLASS_LABELS_200_VALIDATION = ( + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 229, + 230, + 232, + 233, + 242, + 250, + 
def __init__(
    self,
    dataset_name="scannet",
    data_dir: Optional[Union[str, Tuple[str]]] = "data/processed/scannet",
    label_db_filepath: Optional[
        str
    ] = "configs/scannet_preprocessing/label_database.yaml",
    # mean std values from scannet
    color_mean_std: Optional[Union[str, Tuple[Tuple[float]]]] = (
        (0.47793125906962, 0.4303257521323044, 0.3749598901421883),
        (0.2834475483823543, 0.27566157565723015, 0.27018971370874995),
    ),
    mode: Optional[str] = "train",
    add_colors: Optional[bool] = True,
    add_normals: Optional[bool] = True,
    add_raw_coordinates: Optional[bool] = False,
    add_instance: Optional[bool] = False,
    num_labels: Optional[int] = -1,
    data_percent: Optional[float] = 1.0,
    ignore_label: Optional[Union[int, Tuple[int]]] = 255,
    volume_augmentations_path: Optional[str] = None,
    image_augmentations_path: Optional[str] = None,
    instance_oversampling=0,
    place_around_existing=False,
    max_cut_region=0,
    point_per_cut=100,
    flip_in_center=False,
    noise_rate=0.0,
    resample_points=0.0,
    cache_data=False,
    add_unlabeled_pc=False,
    task="instance_segmentation",
    cropping=False,
    cropping_args=None,
    is_tta=False,
    crop_min_size=20000,
    crop_length=6.0,
    cropping_v1=True,
    reps_per_epoch=1,
    area=-1,
    on_crops=False,
    eval_inner_core=-1,
    filter_out_classes=None,
    label_offset=0,
    add_clip=False,
    is_elastic_distortion=True,
    color_drop=0.0,
):
    """Configure the dataset and load its scene database.

    Most arguments are stored unchanged on ``self`` for use by other
    methods. The constructor itself:

    * picks ``self.color_map`` from ``dataset_name`` ("scannet",
      "scannet200", "stpls3d" or "s3dis");
    * builds ``self.random_cuboid`` from ``crop_min_size``, ``crop_length``
      and ``cropping_v1``;
    * loads ``<data_dir>/Validation_database.yaml`` for every entry of
      ``data_dir`` (``train_Validation_database.yaml`` for s3dis when
      ``mode == "train"``) and optionally subsamples it to
      ``data_percent``;
    * resolves ``color_mean_std`` (a path to a yaml file with ``mean`` /
      ``std`` keys, or a ``(mean, std)`` pair of 3-tuples) and, when
      ``add_colors`` is set, builds ``self.normalize_color``;
    * loads the volume / image augmentation pipelines when paths are given;
    * when ``cache_data`` is set, loads every scene array into memory and,
      with ``on_crops``, replaces the database by per-block entries.

    Raises:
        AssertionError: unknown ``task`` or ``dataset_name``, or a crop
            block with 10000 points or fewer while caching on crops.
        ValueError: ``add_colors`` is set but ``color_mean_std`` is neither
            an existing yaml file nor a pair of 3-tuples.
        SystemExit: a required database yaml file does not exist.
    """
    if task not in ("instance_segmentation", "semantic_segmentation"):
        raise AssertionError("unknown task")

    self.add_clip = add_clip
    self.dataset_name = dataset_name
    self.is_elastic_distortion = is_elastic_distortion
    self.color_drop = color_drop

    if self.dataset_name == "scannet":
        # NOTE(review): this aliases the module-level SCANNET_COLOR_MAP_20
        # and adds key 255 to it in place; kept as-is because other code
        # may read the mutated constant - confirm before copying.
        self.color_map = SCANNET_COLOR_MAP_20
        self.color_map[255] = (255, 255, 255)
    elif self.dataset_name == "stpls3d":
        self.color_map = {
            0: [0, 255, 0],  # Ground
            1: [0, 0, 255],  # Build
            2: [0, 255, 255],  # LowVeg
            3: [255, 255, 0],  # MediumVeg
            4: [255, 0, 255],  # HiVeg
            5: [100, 100, 255],  # Vehicle
            6: [200, 200, 100],  # Truck
            7: [170, 120, 200],  # Aircraft
            8: [255, 0, 0],  # MilitaryVec
            9: [200, 100, 100],  # Bike
            10: [10, 200, 100],  # Motorcycle
            11: [200, 200, 200],  # LightPole
            12: [50, 50, 50],  # StreetSign
            13: [60, 130, 60],  # Clutter
            14: [130, 30, 60],  # Fence
        }
    elif self.dataset_name == "scannet200":
        self.color_map = SCANNET_COLOR_MAP_200
    elif self.dataset_name == "s3dis":
        self.color_map = {
            0: [0, 255, 0],  # ceiling
            1: [0, 0, 255],  # floor
            2: [0, 255, 255],  # wall
            3: [255, 255, 0],  # beam
            4: [255, 0, 255],  # column
            5: [100, 100, 255],  # window
            6: [200, 200, 100],  # door
            7: [170, 120, 200],  # table
            8: [255, 0, 0],  # chair
            9: [200, 100, 100],  # sofa
            10: [10, 200, 100],  # bookcase
            11: [200, 200, 200],  # board
            12: [50, 50, 50],  # clutter
        }
    else:
        raise AssertionError("dataset not known")

    self.task = task

    # A fresh list per instance instead of a shared mutable default.
    self.filter_out_classes = (
        [] if filter_out_classes is None else filter_out_classes
    )
    self.label_offset = label_offset

    self.area = area
    self.eval_inner_core = eval_inner_core

    self.reps_per_epoch = reps_per_epoch

    self.cropping = cropping
    self.cropping_args = cropping_args
    self.is_tta = is_tta
    self.on_crops = on_crops

    self.crop_min_size = crop_min_size
    self.crop_length = crop_length

    self.version1 = cropping_v1

    self.random_cuboid = RandomCuboid(
        self.crop_min_size,
        crop_length=self.crop_length,
        version1=self.version1,
    )

    self.mode = mode
    self.data_dir = data_dir
    self.add_unlabeled_pc = add_unlabeled_pc
    if add_unlabeled_pc:
        self.other_database = self._load_yaml(
            Path(data_dir).parent / "matterport" / "train_database.yaml"
        )
    if isinstance(data_dir, str):
        self.data_dir = [self.data_dir]
    self.ignore_label = ignore_label
    self.add_colors = add_colors
    self.add_normals = add_normals
    self.add_instance = add_instance
    self.add_raw_coordinates = add_raw_coordinates
    self.instance_oversampling = instance_oversampling
    self.place_around_existing = place_around_existing
    self.max_cut_region = max_cut_region
    self.point_per_cut = point_per_cut
    self.flip_in_center = flip_in_center
    self.noise_rate = noise_rate
    self.resample_points = resample_points

    # loading database files
    self._data = []
    for database_path in self.data_dir:
        database_path = Path(database_path)
        # The database split is fixed to "Validation" regardless of the
        # ``mode`` argument (``self.mode`` keeps the caller's value); only
        # s3dis in train mode uses a "train_" prefix.
        split_name = "Validation"
        if self.dataset_name == "s3dis" and self.mode == "train":
            split_name = "train_" + split_name
        database_file = database_path / f"{split_name}_database.yaml"
        if not database_file.exists():
            print(f"generate {database_path}/{split_name}_database.yaml first")
            exit()
        self._data.extend(self._load_yaml(database_file))
    if data_percent < 1.0:
        self._data = sample(
            self._data, int(len(self._data) * data_percent)
        )

    if instance_oversampling > 0:
        self.instance_data = self._load_yaml(
            Path(label_db_filepath).parent / "instance_database.yaml"
        )

    # normalize color channels
    if self.dataset_name == "s3dis" and isinstance(color_mean_std, str):
        # Only a file path can be redirected to the per-area statistics;
        # a (mean, std) tuple is used as given.
        color_mean_std = color_mean_std.replace(
            "color_mean_std.yaml", f"Area_{self.area}_color_mean_std.yaml"
        )

    color_mean = color_std = None
    if Path(str(color_mean_std)).exists():
        color_mean_std = self._load_yaml(color_mean_std)
        color_mean, color_std = (
            tuple(color_mean_std["mean"]),
            tuple(color_mean_std["std"]),
        )
    elif len(color_mean_std[0]) == 3 and len(color_mean_std[1]) == 3:
        color_mean, color_std = color_mean_std[0], color_mean_std[1]
    else:
        logger.error(
            "pass mean and std as tuple of tuples, or as an .yaml file"
        )

    # augmentations
    self.volume_augmentations = V.NoOp()
    if (volume_augmentations_path is not None) and (
        volume_augmentations_path != "none"
    ):
        self.volume_augmentations = V.load(
            Path(volume_augmentations_path), data_format="yaml"
        )
    self.image_augmentations = A.NoOp()
    if (image_augmentations_path is not None) and (
        image_augmentations_path != "none"
    ):
        self.image_augmentations = A.load(
            Path(image_augmentations_path), data_format="yaml"
        )
    # mandatory color augmentation
    if add_colors:
        if color_mean is None:
            # Previously this surfaced as an UnboundLocalError.
            raise ValueError(
                "add_colors=True needs color statistics: "
                "pass mean and std as tuple of tuples, or as an .yaml file"
            )
        self.normalize_color = A.Normalize(mean=color_mean, std=color_std)

    self.cache_data = cache_data
    if self.cache_data:
        new_data = []
        for i in range(len(self._data)):
            # NOTE(review): ``self.data`` is defined outside this method;
            # presumably it exposes ``self._data`` - confirm.
            self._data[i]["data"] = np.load(
                self.data[i]["filepath"].replace("../../", "")
            )
            if self.on_crops:
                if self.eval_inner_core == -1:
                    for block_id, block in enumerate(
                        self.splitPointCloud(self._data[i]["data"])
                    ):
                        if len(block) > 10000:
                            new_data.append(
                                {
                                    "instance_gt_filepath": self._data[i][
                                        "instance_gt_filepath"
                                    ][block_id]
                                    if len(
                                        self._data[i][
                                            "instance_gt_filepath"
                                        ]
                                    )
                                    > 0
                                    else list(),
                                    "scene": f"{self._data[i]['scene'].replace('.txt', '')}_{block_id}.txt",
                                    "raw_filepath": f"{self.data[i]['filepath'].replace('.npy', '')}_{block_id}",
                                    "data": block,
                                }
                            )
                        else:
                            # Every block is expected to be well populated;
                            # explicit raise so the check survives ``-O``.
                            raise AssertionError(
                                "crop block has 10000 points or fewer"
                            )
                else:
                    conds_inner, blocks_outer = self.splitPointCloud(
                        self._data[i]["data"],
                        size=self.crop_length,
                        inner_core=self.eval_inner_core,
                    )

                    for block_id in range(len(conds_inner)):
                        cond_inner = conds_inner[block_id]
                        block_outer = blocks_outer[block_id]

                        if cond_inner.sum() > 10000:
                            new_data.append(
                                {
                                    "instance_gt_filepath": self._data[i][
                                        "instance_gt_filepath"
                                    ][block_id]
                                    if len(
                                        self._data[i][
                                            "instance_gt_filepath"
                                        ]
                                    )
                                    > 0
                                    else list(),
                                    "scene": f"{self._data[i]['scene'].replace('.txt', '')}_{block_id}.txt",
                                    "raw_filepath": f"{self.data[i]['filepath'].replace('.npy', '')}_{block_id}",
                                    "data": block_outer,
                                    "cond_inner": cond_inner,
                                }
                            )
                        else:
                            raise AssertionError(
                                "inner core has 10000 points or fewer"
                            )

        if self.on_crops:
            self._data = new_data
def splitPointCloud(self, cloud, size=50.0, stride=50, inner_core=-1):
    """Tile a point cloud on a regular XY grid anchored at the origin.

    Grid cells start at ``(i * stride, j * stride)``; the number of cells
    per axis is derived from the maximum x / y value only, so rows with
    negative x or y are never selected. All interval bounds are inclusive.

    Two modes:

    * ``inner_core == -1``: each cell selects the rows with x, y inside
      ``[origin, origin + size]``.
    * otherwise: each cell has an inner square ``[origin, origin +
      inner_core]`` and an outer square of edge ``size`` sharing the same
      centre. The outer block is cut from ``cloud``; the inner condition
      is a mask over the rows of that outer block.

    Args:
        cloud: 2-D NumPy array; column 0 is used as x and column 1 as y.
        size: edge length of a block (outer block in inner-core mode).
        stride: distance between neighbouring cell origins.
        inner_core: edge length of the inner square, or -1 to disable.

    Returns:
        ``inner_core == -1``: list of arrays, one per cell. Otherwise a
        tuple ``(conds_inner, blocks_outer)`` of equally long lists. Cells
        are ordered with x as the outer and y as the inner loop; empty
        cells are included. An empty ``cloud`` yields ``[]`` or
        ``([], [])`` respectively.
    """
    plain_tiling = inner_core == -1

    if len(cloud) == 0:
        # np.amax raises ValueError on an empty array; nothing to tile.
        return [] if plain_tiling else ([], [])

    limit_max = np.amax(cloud[:, 0:3], axis=0)
    # The grid extent is governed by the block edge that must fit the scene.
    footprint = size if plain_tiling else inner_core
    width = int(np.ceil((limit_max[0] - footprint) / stride)) + 1
    depth = int(np.ceil((limit_max[1] - footprint) / stride)) + 1
    cells = [
        (x * stride, y * stride) for x in range(width) for y in range(depth)
    ]

    xs = cloud[:, 0]
    ys = cloud[:, 1]

    if plain_tiling:
        blocks = []
        for x, y in cells:
            xcond = (xs <= x + size) & (xs >= x)
            ycond = (ys <= y + size) & (ys >= y)
            blocks.append(cloud[xcond & ycond, :])
        return blocks

    half_size = size / 2
    blocks_outer = []
    conds_inner = []
    for x, y in cells:
        # Centre shared by the inner core and the outer context window.
        center_x = x + inner_core / 2.0
        center_y = y + inner_core / 2.0

        xcond_outer = (xs <= center_x + half_size) & (
            xs >= center_x - half_size
        )
        ycond_outer = (ys <= center_y + half_size) & (
            ys >= center_y - half_size
        )
        block_outer = cloud[xcond_outer & ycond_outer, :]

        xcond_inner = (block_outer[:, 0] <= x + inner_core) & (
            block_outer[:, 0] >= x
        )
        ycond_inner = (block_outer[:, 1] <= y + inner_core) & (
            block_outer[:, 1] >= y
        )

        conds_inner.append(xcond_inner & ycond_inner)
        blocks_outer.append(block_outer)
    return conds_inner, blocks_outer
self.cache_data: + points = self.data[idx]["data"] + else: + assert not self.on_crops, "you need caching if on crops" + points = np.load(self.data[idx]["filepath"].replace("../../", "")) + + if "train" in self.mode and self.dataset_name in ["s3dis", "stpls3d"]: + inds = self.random_cuboid(points) + points = points[inds] + + coordinates, color, normals, segments, labels = ( + points[:, :3], + points[:, 3:6], + points[:, 6:9], + points[:, 9], + points[:, 10:12], + ) + + raw_coordinates = coordinates.copy() + raw_color = color + raw_normals = normals + + if not self.add_colors: + color = np.ones((len(color), 3)) + + # volume and image augmentations for train + if "train" in self.mode or self.is_tta: + if self.cropping: + new_idx = self.random_cuboid( + coordinates, + labels[:, 1], + self._remap_from_zero(labels[:, 0].copy()), + ) + + coordinates = coordinates[new_idx] + color = color[new_idx] + labels = labels[new_idx] + segments = segments[new_idx] + raw_color = raw_color[new_idx] + raw_normals = raw_normals[new_idx] + normals = normals[new_idx] + points = points[new_idx] + + coordinates -= coordinates.mean(0) + + try: + coordinates += ( + np.random.uniform(coordinates.min(0), coordinates.max(0)) + / 2 + ) + except OverflowError as err: + print(coordinates) + print(coordinates.shape) + raise err + + if self.instance_oversampling > 0.0: + ( + coordinates, + color, + normals, + labels, + ) = self.augment_individual_instance( + coordinates, + color, + normals, + labels, + self.instance_oversampling, + ) + + if self.flip_in_center: + coordinates = flip_in_center(coordinates) + + for i in (0, 1): + if random() < 0.5: + coord_max = np.max(points[:, i]) + coordinates[:, i] = coord_max - coordinates[:, i] + + if random() < 0.95: + if self.is_elastic_distortion: + for granularity, magnitude in ((0.2, 0.4), (0.8, 1.6)): + coordinates = elastic_distortion( + coordinates, granularity, magnitude + ) + aug = self.volume_augmentations( + points=coordinates, + normals=normals, + 
features=color, + labels=labels, + ) + coordinates, color, normals, labels = ( + aug["points"], + aug["features"], + aug["normals"], + aug["labels"], + ) + pseudo_image = color.astype(np.uint8)[np.newaxis, :, :] + color = np.squeeze( + self.image_augmentations(image=pseudo_image)["image"] + ) + + if self.point_per_cut != 0: + number_of_cuts = int(len(coordinates) / self.point_per_cut) + for _ in range(number_of_cuts): + size_of_cut = np.random.uniform(0.05, self.max_cut_region) + # not wall, floor or empty + point = choice(coordinates) + x_min = point[0] - size_of_cut + x_max = x_min + size_of_cut + y_min = point[1] - size_of_cut + y_max = y_min + size_of_cut + z_min = point[2] - size_of_cut + z_max = z_min + size_of_cut + indexes = crop( + coordinates, x_min, y_min, z_min, x_max, y_max, z_max + ) + coordinates, normals, color, labels = ( + coordinates[~indexes], + normals[~indexes], + color[~indexes], + labels[~indexes], + ) + + # if self.noise_rate > 0: + # coordinates, color, normals, labels = random_points( + # coordinates, + # color, + # normals, + # labels, + # self.noise_rate, + # self.ignore_label, + # ) + + if (self.resample_points > 0) or (self.noise_rate > 0): + coordinates, color, normals, labels = random_around_points( + coordinates, + color, + normals, + labels, + self.resample_points, + self.noise_rate, + self.ignore_label, + ) + + if self.add_unlabeled_pc: + if random() < 0.8: + new_points = np.load( + self.other_database[ + np.random.randint(0, len(self.other_database) - 1) + ]["filepath"] + ) + ( + unlabeled_coords, + unlabeled_color, + unlabeled_normals, + unlabeled_labels, + ) = ( + new_points[:, :3], + new_points[:, 3:6], + new_points[:, 6:9], + new_points[:, 9:], + ) + unlabeled_coords -= unlabeled_coords.mean(0) + unlabeled_coords += ( + np.random.uniform( + unlabeled_coords.min(0), unlabeled_coords.max(0) + ) + / 2 + ) + + aug = self.volume_augmentations( + points=unlabeled_coords, + normals=unlabeled_normals, + features=unlabeled_color, + 
labels=unlabeled_labels, + ) + ( + unlabeled_coords, + unlabeled_color, + unlabeled_normals, + unlabeled_labels, + ) = ( + aug["points"], + aug["features"], + aug["normals"], + aug["labels"], + ) + pseudo_image = unlabeled_color.astype(np.uint8)[ + np.newaxis, :, : + ] + unlabeled_color = np.squeeze( + self.image_augmentations(image=pseudo_image)["image"] + ) + + coordinates = np.concatenate( + (coordinates, unlabeled_coords) + ) + color = np.concatenate((color, unlabeled_color)) + normals = np.concatenate((normals, unlabeled_normals)) + labels = np.concatenate( + ( + labels, + np.full_like(unlabeled_labels, self.ignore_label), + ) + ) + + if random() < self.color_drop: + color[:] = 255 + + # normalize color information + pseudo_image = color.astype(np.uint8)[np.newaxis, :, :] + color = np.squeeze(self.normalize_color(image=pseudo_image)["image"]) + + # prepare labels and map from 0 to 20(40) + labels = labels.astype(np.int32) + # if labels.size > 0: + # labels[:, 0] = self._remap_from_zero(labels[:, 0]) + # if not self.add_instance: + # # taking only first column, which is segmentation label, not instance + # labels = labels[:, 0].flatten()[..., None] + + labels = np.hstack((labels, segments[..., None].astype(np.int32))) + + features = color + if self.add_normals: + features = np.hstack((features, normals)) + if self.add_raw_coordinates: + if len(features.shape) == 1: + features = np.hstack((features[None, ...], coordinates)) + else: + features = np.hstack((features, coordinates)) + + # if self.task != "semantic_segmentation": + if self.data[idx]["raw_filepath"].split("/")[-2] in [ + "scene0636_00", + "scene0154_00", + ]: + return self.__getitem__(0) + + if self.dataset_name == "s3dis": + return ( + coordinates, + features, + labels, + self.data[idx]["area"] + "_" + self.data[idx]["scene"], + raw_color, + raw_normals, + raw_coordinates, + idx, + ) + if self.dataset_name == "stpls3d": + if labels.shape[1] != 1: # only segments --> test set! 
def _remap_from_zero(self, labels):
    """Remap raw label ids to consecutive indices, in place.

    Every value that is not a key of ``self.label_info`` becomes
    ``self.ignore_label``; every key is replaced by its position in
    ``self.label_info`` (0, 1, 2, ...).

    Args:
        labels: numpy array of raw label ids. It is modified in place.

    Returns:
        The same ``labels`` array object, after remapping.
    """
    known_ids = list(self.label_info.keys())
    # Compare against an untouched snapshot: writing into ``labels`` while
    # also reading from it lets an already remapped value (e.g. 0) be
    # matched again by a later key with the same number.
    source = labels.copy()

    labels[~np.isin(source, known_ids)] = self.ignore_label
    # remap to the range from 0
    for new_id, raw_id in enumerate(known_ids):
        labels[source == raw_id] = new_id
    return labels
def elastic_distortion(pointcloud, granularity, magnitude):
    """Apply elastic distortion on sparse coordinate space.

    A random noise grid is smoothed with box filters, interpolated at every
    point position and added to the coordinates.

    Args:
        pointcloud: numpy array of (number of points, at least 3 spatial
            dims). The first three columns are overwritten in place.
        granularity: size of the noise grid (in same scale[m/cm] as the
            voxel grid).
        magnitude: noise multiplier.

    Returns:
        The same ``pointcloud`` array object with distorted coordinates.
    """
    # One 3-tap box filter per spatial axis; the trailing axis (the three
    # noise channels) is never blurred.
    box_kernels = [
        np.ones(shape).astype("float32") / 3
        for shape in ((3, 1, 1, 1), (1, 3, 1, 1), (1, 1, 3, 1))
    ]
    coords = pointcloud[:, :3]
    coords_min = coords.min(0)

    # Create Gaussian noise tensor of the size given by granularity.
    noise_dim = ((coords - coords_min).max(0) // granularity).astype(int) + 3
    noise = np.random.randn(*noise_dim, 3).astype(np.float32)

    # Smoothing: two passes of the separable box blur (x, then y, then z).
    # ``scipy.ndimage.convolve`` is the public name of the function formerly
    # reached through the deprecated ``scipy.ndimage.filters`` namespace.
    for _ in range(2):
        for kernel in box_kernels:
            noise = scipy.ndimage.convolve(
                noise, kernel, mode="constant", cval=0
            )

    # Trilinear interpolate noise filters for each spatial dimensions.
    # The grid starts one cell before the minimum coordinate so every point
    # lies strictly inside it.
    grid_axes = [
        np.linspace(lo, hi, steps)
        for lo, hi, steps in zip(
            coords_min - granularity,
            coords_min + granularity * (noise_dim - 2),
            noise_dim,
        )
    ]
    interp = scipy.interpolate.RegularGridInterpolator(
        grid_axes, noise, bounds_error=False, fill_value=0
    )
    pointcloud[:, :3] = coords + interp(coords) * magnitude
    return pointcloud
def flip_in_center(coordinates):
    """Flip each XY quadrant of a centred point cloud independently.

    The cloud is first moved so that its mean is at the origin (in place).
    Points are then grouped by the sign of their X and Y coordinate; every
    non-empty group is passed through two ``V.Flip3d`` transforms and, for
    all groups except (+x, +y), shifted by the group's pre-flip minimum with
    selected components zeroed.

    Points lying exactly on an axis (x == 0 or y == 0) belong to no quadrant
    and are only centred.

    Args:
        coordinates: float array of shape (N, >=3); modified in place.

    Returns:
        The same ``coordinates`` array object.
    """
    # moving coordinates to center
    coordinates -= coordinates.mean(0)

    x_pos = coordinates[:, 0] > 0
    x_neg = coordinates[:, 0] < 0
    y_pos = coordinates[:, 1] > 0
    y_neg = coordinates[:, 1] < 0

    # (quadrant mask, components of the pre-flip minimum that are zeroed
    # before it is added back); ``None`` means "no shift at all".
    quadrants = (
        (x_pos & y_pos, None),  # x y
        (x_pos & y_neg, [2, 0]),  # x -y
        (x_neg & y_pos, [2, 1]),  # -x y
        (x_neg & y_neg, [2]),  # -x -y
    )

    # ``mask.size`` is the length of the mask, not the number of selected
    # points, so it cannot detect an empty quadrant; ``.any()`` can. An empty
    # quadrant must be skipped because ``.min(0)`` of zero rows raises.
    populated = [(mask, zeroed) for mask, zeroed in quadrants if mask.any()]
    if not populated:
        return coordinates

    aug = V.Compose(
        [
            V.Flip3d(axis=(0, 1, 0), always_apply=True),
            V.Flip3d(axis=(1, 0, 0), always_apply=True),
        ]
    )

    for mask, zeroed in populated:
        if zeroed is None:
            coordinates[mask] = aug(points=coordinates[mask])["points"]
            continue
        shift = coordinates[mask].min(0)
        shift[zeroed] = 0
        coordinates[mask] = aug(points=coordinates[mask])["points"]
        coordinates[mask] += shift

    return coordinates
rate) + ) + noisy_coordinates = deepcopy(coordinates[coord_indexes]) + noisy_coordinates += np.random.uniform( + -0.2 - noise_rate, 0.2 + noise_rate, size=noisy_coordinates.shape + ) + + if noise_rate > 0: + noisy_color = np.random.randint(0, 255, size=noisy_coordinates.shape) + noisy_normals = np.random.rand(*noisy_coordinates.shape) * 2 - 1 + noisy_labels = np.full(labels[coord_indexes].shape, ignore_label) + + coordinates = np.vstack((coordinates, noisy_coordinates)) + color = np.vstack((color, noisy_color)) + normals = np.vstack((normals, noisy_normals)) + labels = np.vstack((labels, noisy_labels)) + else: + noisy_color = deepcopy(color[coord_indexes]) + noisy_normals = deepcopy(normals[coord_indexes]) + noisy_labels = deepcopy(labels[coord_indexes]) + + coordinates = np.vstack((coordinates, noisy_coordinates)) + color = np.vstack((color, noisy_color)) + normals = np.vstack((normals, noisy_normals)) + labels = np.vstack((labels, noisy_labels)) + + return coordinates, color, normals, labels + + +def random_points( + coordinates, color, normals, labels, noise_rate=0.6, ignore_label=255 +): + max_boundary = coordinates.max(0) + 0.1 + min_boundary = coordinates.min(0) - 0.1 + + noisy_coordinates = int( + (max(max_boundary) - min(min_boundary)) / noise_rate + ) + + noisy_coordinates = np.array( + list( + product( + np.linspace( + min_boundary[0], max_boundary[0], noisy_coordinates + ), + np.linspace( + min_boundary[1], max_boundary[1], noisy_coordinates + ), + np.linspace( + min_boundary[2], max_boundary[2], noisy_coordinates + ), + ) + ) + ) + noisy_coordinates += np.random.uniform( + -noise_rate, noise_rate, size=noisy_coordinates.shape + ) + + noisy_color = np.random.randint(0, 255, size=noisy_coordinates.shape) + noisy_normals = np.random.rand(*noisy_coordinates.shape) * 2 - 1 + noisy_labels = np.full( + (noisy_coordinates.shape[0], labels.shape[1]), ignore_label + ) + + coordinates = np.vstack((coordinates, noisy_coordinates)) + color = np.vstack((color, 
class VoxelizeCollate:
    """Collate callable that optionally crops a batch and then voxelizes it.

    All constructor arguments are stored on the instance; ``__call__``
    forwards them to ``voxelize``. In a training mode the batch is first
    split with ``make_crops`` (once for ``small_crops``, twice for
    ``very_small_crops``).
    """

    def __init__(
        self,
        ignore_label=255,
        voxel_size=1,
        mode="test",
        small_crops=False,
        very_small_crops=False,
        batch_instance=False,
        probing=False,
        task="instance_segmentation",
        ignore_class_threshold=100,
        filter_out_classes=None,
        label_offset=0,
        num_queries=None,
    ):
        """Store the collate configuration.

        Args:
            ignore_label: label value forwarded to ``voxelize``.
            voxel_size: voxel edge length forwarded to ``voxelize``.
            mode: split name; cropping only happens when it contains
                ``"train"``.
            small_crops: apply ``make_crops`` once in training.
            very_small_crops: apply ``make_crops`` twice in training.
            batch_instance: stored, not used by this class.
            probing: forwarded to ``voxelize``.
            task: ``"instance_segmentation"`` or ``"semantic_segmentation"``.
            ignore_class_threshold: forwarded to ``voxelize``.
            filter_out_classes: class ids forwarded to ``voxelize``;
                ``None`` (the default) means an empty list.
            label_offset: forwarded to ``voxelize``.
            num_queries: forwarded to ``voxelize``.

        Raises:
            AssertionError: if ``task`` is not one of the two known tasks.
        """
        assert task in [
            "instance_segmentation",
            "semantic_segmentation",
        ], "task not known"
        self.task = task
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared by every collate instance created without the
        # argument.
        self.filter_out_classes = (
            [] if filter_out_classes is None else filter_out_classes
        )
        self.label_offset = label_offset
        self.voxel_size = voxel_size
        self.ignore_label = ignore_label
        self.mode = mode
        self.batch_instance = batch_instance
        self.small_crops = small_crops
        self.very_small_crops = very_small_crops
        self.probing = probing
        self.ignore_class_threshold = ignore_class_threshold

        self.num_queries = num_queries

    def __call__(self, batch):
        """Crop (training only) and voxelize ``batch``; return ``voxelize``'s result."""
        training = "train" in self.mode
        if training and (self.small_crops or self.very_small_crops):
            batch = make_crops(batch)
        # A second pass splits every quadrant again.
        if training and self.very_small_crops:
            batch = make_crops(batch)
        return voxelize(
            batch,
            self.ignore_label,
            self.voxel_size,
            self.probing,
            self.mode,
            task=self.task,
            ignore_class_threshold=self.ignore_class_threshold,
            filter_out_classes=self.filter_out_classes,
            label_offset=self.label_offset,
            num_queries=self.num_queries,
        )
mode="test", + scenes=2, + small_crops=False, + very_small_crops=False, + batch_instance=False, + make_one_pc_noise=False, + place_nearby=False, + place_far=False, + proba=1, + probing=False, + task="instance_segmentation", + ): + assert task in [ + "instance_segmentation", + "semantic_segmentation", + ], "task not known" + self.task = task + self.mode = mode + self.scenes = scenes + self.small_crops = small_crops + self.very_small_crops = very_small_crops + self.ignore_label = ignore_label + self.voxel_size = voxel_size + self.batch_instance = batch_instance + self.make_one_pc_noise = make_one_pc_noise + self.place_nearby = place_nearby + self.place_far = place_far + self.proba = proba + self.probing = probing + + def __call__(self, batch): + if ( + ("train" in self.mode) + and (not self.make_one_pc_noise) + and (self.proba > random()) + ): + if self.small_crops or self.very_small_crops: + batch = make_crops(batch) + if self.very_small_crops: + batch = make_crops(batch) + if self.batch_instance: + batch = batch_instances(batch) + new_batch = [] + for i in range(0, len(batch), self.scenes): + batch_coordinates = [] + batch_features = [] + batch_labels = [] + + batch_filenames = "" + batch_raw_color = [] + batch_raw_normals = [] + + offset_instance_id = 0 + offset_segment_id = 0 + + for j in range(min(len(batch[i:]), self.scenes)): + batch_coordinates.append(batch[i + j][0]) + batch_features.append(batch[i + j][1]) + + if j == 0: + batch_filenames = batch[i + j][3] + else: + batch_filenames = ( + batch_filenames + f"+{batch[i + j][3]}" + ) + + batch_raw_color.append(batch[i + j][4]) + batch_raw_normals.append(batch[i + j][5]) + + # make instance ids and segment ids unique + # take care that -1 instances stay at -1 + batch_labels.append( + batch[i + j][2] + + [0, offset_instance_id, offset_segment_id] + ) + batch_labels[-1][batch[i + j][2][:, 1] == -1, 1] = -1 + + max_instance_id, max_segment_id = batch[i + j][2].max( + axis=0 + )[1:] + offset_segment_id = 
def batch_instances(batch):
    """Split every sample of a batch into one entry per instance id.

    Args:
        batch: iterable of samples. For each sample, element 0 and element 1
            are arrays indexed row-wise, and element 2 is a 2-D label array
            whose column 1 holds the instance id.

    Returns:
        A list of 3-tuples ``(sample[0] rows, sample[1] rows, label column 0)``
        restricted to one instance each, ordered by sample and then by
        ascending instance id.
    """
    new_batch = []
    for sample in batch:
        rows_a, rows_b, labels = sample[0], sample[1], sample[2]
        instance_column = labels[:, 1]
        for instance_id in np.unique(instance_column):
            # Build the selection mask once and reuse it for all three arrays.
            selected = instance_column == instance_id
            new_batch.append(
                (
                    rows_a[selected],
                    rows_b[selected],
                    labels[selected][:, 0],
                ),
            )
    return new_batch
task, + ignore_class_threshold, + filter_out_classes, + label_offset, + num_queries, +): + ( + coordinates, + features, + labels, + original_labels, + inverse_maps, + original_colors, + original_normals, + original_coordinates, + idx, + ) = ([], [], [], [], [], [], [], [], []) + voxelization_dict = { + "ignore_label": ignore_label, + # "quantization_size": self.voxel_size, + "return_index": True, + "return_inverse": True, + } + + full_res_coords = [] + + for sample in batch: + idx.append(sample[7]) + original_coordinates.append(sample[6]) + original_labels.append(sample[2]) + full_res_coords.append(sample[0]) + original_colors.append(sample[4]) + original_normals.append(sample[5]) + + coords = np.floor(sample[0] / voxel_size) + voxelization_dict.update( + { + "coordinates": torch.from_numpy(coords).to("cpu").contiguous(), + "features": sample[1], + } + ) + + # maybe this change (_, _, ...) is not necessary and we can directly get out + # the sample coordinates? + _, _, unique_map, inverse_map = ME.utils.sparse_quantize( + **voxelization_dict + ) + inverse_maps.append(inverse_map) + + sample_coordinates = coords[unique_map] + coordinates.append(torch.from_numpy(sample_coordinates).int()) + sample_features = sample[1][unique_map] + features.append(torch.from_numpy(sample_features).float()) + if len(sample[2]) > 0: + sample_labels = sample[2][unique_map] + labels.append(torch.from_numpy(sample_labels).long()) + + # Concatenate all lists + input_dict = {"coords": coordinates, "feats": features} + if len(labels) > 0: + input_dict["labels"] = labels + coordinates, features, labels = ME.utils.sparse_collate(**input_dict) + else: + coordinates, features = ME.utils.sparse_collate(**input_dict) + labels = torch.Tensor([]) + + if probing: + return ( + NoGpu( + coordinates, + features, + original_labels, + inverse_maps, + ), + labels, + ) + + if mode == "test": + for i in range(len(input_dict["labels"])): + _, ret_index, ret_inv = np.unique( + input_dict["labels"][i][:, 0], + 
return_index=True, + return_inverse=True, + ) + input_dict["labels"][i][:, 0] = torch.from_numpy(ret_inv) + # input_dict["segment2label"].append(input_dict["labels"][i][ret_index][:, :-1]) + else: + input_dict["segment2label"] = [] + + if "labels" in input_dict: + for i in range(len(input_dict["labels"])): + # TODO BIGGER CHANGE CHECK!!! + _, ret_index, ret_inv = np.unique( + input_dict["labels"][i][:, -1], + return_index=True, + return_inverse=True, + ) + input_dict["labels"][i][:, -1] = torch.from_numpy(ret_inv) + input_dict["segment2label"].append( + input_dict["labels"][i][ret_index][:, :-1] + ) + + if "labels" in input_dict: + list_labels = input_dict["labels"] + + target = [] + target_full = [] + + if len(list_labels[0].shape) == 1: + for batch_id in range(len(list_labels)): + label_ids = list_labels[batch_id].unique() + if 255 in label_ids: + label_ids = label_ids[:-1] + + target.append( + { + "labels": label_ids, + "masks": list_labels[batch_id] + == label_ids.unsqueeze(1), + } + ) + else: + if mode == "test": + for i in range(len(input_dict["labels"])): + target.append( + {"point2segment": input_dict["labels"][i][:, 0]} + ) + target_full.append( + { + "point2segment": torch.from_numpy( + original_labels[i][:, 0] + ).long() + } + ) + else: + target = get_instance_masks( + list_labels, + list_segments=input_dict["segment2label"], + task=task, + ignore_class_threshold=ignore_class_threshold, + filter_out_classes=filter_out_classes, + label_offset=label_offset, + ) + for i in range(len(target)): + target[i]["point2segment"] = input_dict["labels"][i][:, 2] + if "train" not in mode: + target_full = get_instance_masks( + [torch.from_numpy(l) for l in original_labels], + task=task, + ignore_class_threshold=ignore_class_threshold, + filter_out_classes=filter_out_classes, + label_offset=label_offset, + ) + for i in range(len(target_full)): + target_full[i]["point2segment"] = torch.from_numpy( + original_labels[i][:, 2] + ).long() + else: + target = [] + target_full 
def get_instance_masks(
    list_labels,
    task,
    list_segments=None,
    ignore_class_threshold=100,
    filter_out_classes=None,
    label_offset=0,
):
    """Build per-sample target dictionaries from point-wise labels.

    Args:
        list_labels: one integer tensor per sample; column 0 is read as the
            class id, column 1 as the instance id and (only when
            ``list_segments`` is given) column 2 as the segment index.
        task: ``"semantic_segmentation"`` merges all instances of a class
            into one mask; any other value keeps one mask per instance.
        list_segments: optional list with one tensor per sample whose first
            dimension is the number of segments. When truthy, a boolean
            ``"segment_mask"`` is produced for every target.
        ignore_class_threshold: kept for interface compatibility.
            NOTE: it currently has no effect, because the only branch that
            read it was reachable solely for class ids that the membership
            test on ``filter_out_classes`` had already skipped.
        filter_out_classes: class ids whose instances are dropped; ``None``
            (the default) drops nothing.
        label_offset: subtracted from the class ids (clamped at 0) for
            non-semantic tasks.

    Returns:
        A list with one dict per sample holding ``"labels"``, ``"masks"``
        and, if requested, ``"segment_mask"``. If any sample ends up with
        no instance at all, an empty list is returned for the whole batch.
    """
    excluded = [] if filter_out_classes is None else filter_out_classes
    target = []

    for sample_labels in list_labels:
        label_ids = []
        masks = []
        segment_masks = []
        instance_column = sample_labels[:, 1]

        for instance_id in instance_column.unique():
            # -1 marks points that belong to no instance.
            if instance_id == -1:
                continue

            instance_mask = instance_column == instance_id
            instance_rows = sample_labels[instance_mask]
            # The class of an instance is taken from its first point.
            label_id = instance_rows[0, 0]

            if label_id in excluded:
                continue

            label_ids.append(label_id)
            masks.append(instance_mask)

            if list_segments:
                segment_mask = torch.zeros(
                    list_segments[len(target)].shape[0]
                ).bool()
                segment_mask[instance_rows[:, 2].unique()] = True
                segment_masks.append(segment_mask)

        if len(label_ids) == 0:
            # Existing contract: one empty sample empties the whole batch.
            return list()

        label_ids = torch.stack(label_ids)
        masks = torch.stack(masks)
        if list_segments:
            segment_masks = torch.stack(segment_masks)

        if task == "semantic_segmentation":
            # Union the masks of all instances that share a class id.
            merged_ids = []
            merged_masks = []
            merged_segment_masks = []
            for label_id in label_ids.unique():
                same_class = label_ids == label_id
                merged_ids.append(label_id)
                merged_masks.append(masks[same_class, :].sum(dim=0).bool())
                if list_segments:
                    merged_segment_masks.append(
                        segment_masks[same_class, :].sum(dim=0).bool()
                    )
            out_labels = torch.stack(merged_ids)
            masks = torch.stack(merged_masks)
            if list_segments:
                segment_masks = torch.stack(merged_segment_masks)
        else:
            out_labels = torch.clamp(label_ids - label_offset, min=0)

        entry = {"labels": out_labels, "masks": masks}
        if list_segments:
            entry["segment_mask"] = segment_masks
        target.append(entry)
    return target
moving all of them to center + for i in range(len(new_batch)): + new_batch[i][0][:, :3] -= new_batch[i][0][:, :3].mean(0) + return new_batch + + +class NoGpu: + def __init__( + self, + coordinates, + features, + original_labels=None, + inverse_maps=None, + full_res_coords=None, + target_full=None, + original_colors=None, + original_normals=None, + original_coordinates=None, + idx=None, + ): + """helper class to prevent gpu loading on lightning""" + self.coordinates = coordinates + self.features = features + self.original_labels = original_labels + self.inverse_maps = inverse_maps + self.full_res_coords = full_res_coords + self.target_full = target_full + self.original_colors = original_colors + self.original_normals = original_normals + self.original_coordinates = original_coordinates + self.idx = idx + + +class NoGpuMask: + def __init__( + self, + coordinates, + features, + original_labels=None, + inverse_maps=None, + masks=None, + labels=None, + ): + """helper class to prevent gpu loading on lightning""" + self.coordinates = coordinates + self.features = features + self.original_labels = original_labels + self.inverse_maps = inverse_maps + + self.masks = masks + self.labels = labels diff --git a/models/Mask3D/build/lib/mask3d/main_instance_segmentation.py b/models/Mask3D/build/lib/mask3d/main_instance_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..c2664673cb3a1fa16191e7baa82a50bbb8f5f195 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/main_instance_segmentation.py @@ -0,0 +1,114 @@ +import logging +import os +from hashlib import md5 +from uuid import uuid4 +import hydra +from dotenv import load_dotenv +from omegaconf import DictConfig, OmegaConf +from trainer.trainer import InstanceSegmentation, RegularCheckpointing +from pytorch_lightning.callbacks import ModelCheckpoint +from utils.utils import ( + flatten_dict, + load_baseline_model, + load_checkpoint_with_missing_or_exsessive_keys, + 
load_backbone_checkpoint_with_missing_or_exsessive_keys, +) +from pytorch_lightning import Trainer, seed_everything + + +def get_parameters(cfg: DictConfig): + logger = logging.getLogger(__name__) + load_dotenv(".env") + + # parsing input parameters + seed_everything(cfg.general.seed) + + # getting basic configuration + if cfg.general.get("gpus", None) is None: + cfg.general.gpus = os.environ.get("CUDA_VISIBLE_DEVICES", None) + loggers = [] + + # cfg.general.experiment_id = "0" # str(Repo("./").commit())[:8] + # params = flatten_dict(OmegaConf.to_container(cfg, resolve=True)) + + # create unique id for experiments that are run locally + # unique_id = "_" + str(uuid4())[:4] + # cfg.general.version = md5(str(params).encode("utf-8")).hexdigest()[:8] + unique_id + + if not os.path.exists(cfg.general.save_dir): + os.makedirs(cfg.general.save_dir) + else: + print("EXPERIMENT ALREADY EXIST") + cfg["trainer"][ + "resume_from_checkpoint" + ] = f"{cfg.general.save_dir}/last-epoch.ckpt" + + for log in cfg.logging: + print(log) + # loggers.append(hydra.utils.instantiate(log)) + # loggers[-1].log_hyperparams( + # flatten_dict(OmegaConf.to_container(cfg, resolve=True)) + # ) + + model = InstanceSegmentation(cfg) + if cfg.general.backbone_checkpoint is not None: + cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys( + cfg, model + ) + if cfg.general.checkpoint is not None: + cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model) + + logger.info(flatten_dict(OmegaConf.to_container(cfg, resolve=True))) + return cfg, model, loggers + + +@hydra.main( + config_path="conf", config_name="config_base_instance_segmentation.yaml" +) +def train(cfg: DictConfig): + os.chdir(hydra.utils.get_original_cwd()) + cfg, model, loggers = get_parameters(cfg) + callbacks = [] + for cb in cfg.callbacks: + callbacks.append(hydra.utils.instantiate(cb)) + + callbacks.append(RegularCheckpointing()) + + runner = Trainer( + logger=loggers, + gpus=cfg.general.gpus, + 
@hydra.main(
    config_path="conf",
    config_name="config_base_instance_segmentation.yaml",
)
def main(cfg: DictConfig):
    """Run training or testing, selected by ``cfg["general"]["train_mode"]``."""
    entry_point = train if cfg["general"]["train_mode"] else test
    entry_point(cfg)
def dice_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
):
    """
    Compute the DICE loss, similar to generalized IOU for masks.

    The function is scripted with ``torch.jit.script`` right after its
    definition, so the body sticks to TorchScript-friendly tensor ops.

    Args:
        inputs: A float tensor of logits; a sigmoid is applied and everything
                after the first dimension is flattened.
        targets: A float tensor broadcastable against the flattened inputs.
                 Stores the binary classification label for each element
                 (0 for the negative class and 1 for the positive class).
        num_masks: Divisor applied to the summed per-mask loss.
    Returns:
        Scalar loss tensor.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = (probs * targets).sum(-1)
    total = probs.sum(-1) + targets.sum(-1)
    # +1 smoothing keeps the ratio defined when both masks are empty.
    per_mask = 1 - (2 * overlap + 1) / (total + 1)
    return per_mask.sum() / num_masks
def calculate_uncertainty(logits):
    """
    Estimate uncertainty as the negated L1 distance between 0.0 and the logit
    prediction in 'logits', so logits close to zero score highest.

    Args:
        logits (Tensor): A tensor of shape (R, 1, ...) whose second dimension
            must have size 1. The values are logits.
    Returns:
        scores (Tensor): A new tensor with the same shape as ``logits`` that
            contains uncertainty scores, with the most uncertain locations
            having the highest uncertainty score.
    """
    assert logits.shape[1] == 1
    # torch.abs is out-of-place, so the input is never modified and no
    # defensive clone is needed.
    return -torch.abs(logits)
+ eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes - 1 + self.class_weights = class_weights + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + + if self.class_weights != -1: + assert ( + len(self.class_weights) == self.num_classes + ), "CLASS WEIGHTS DO NOT MATCH" + empty_weight[:-1] = torch.tensor(self.class_weights) + + self.register_buffer("empty_weight", empty_weight) + + # pointwise mask loss parameters + self.num_points = num_points + self.oversample_ratio = oversample_ratio + self.importance_sample_ratio = importance_sample_ratio + + def loss_labels(self, outputs, targets, indices, num_masks, mask_type): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + src_logits = outputs["pred_logits"].float() + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), + target_classes, + self.empty_weight, + ignore_index=253, + ) + losses = {"loss_ce": loss_ce} + return losses + + def loss_masks(self, outputs, targets, indices, num_masks, mask_type): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + loss_masks = [] + loss_dices = [] + + for batch_id, (map_id, target_id) in enumerate(indices): + map = outputs["pred_masks"][batch_id][:, map_id].T + target_mask = targets[batch_id][mask_type][target_id] + + if self.num_points != -1: + point_idx = torch.randperm( + target_mask.shape[1], device=target_mask.device + )[: int(self.num_points * target_mask.shape[1])] + else: + # sample all points + point_idx = torch.arange( + target_mask.shape[1], device=target_mask.device + ) + + num_masks = target_mask.shape[0] + map = map[:, point_idx] + target_mask = target_mask[:, point_idx].float() + + loss_masks.append(sigmoid_ce_loss_jit(map, target_mask, num_masks)) + loss_dices.append(dice_loss_jit(map, target_mask, num_masks)) + # del target_mask + return { + "loss_mask": torch.sum(torch.stack(loss_masks)), + "loss_dice": torch.sum(torch.stack(loss_dices)), + } + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t[mask_type] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # No need to upsample predictions as we are using normalized coordinates :) + # N x 1 x H x W + src_masks = src_masks[:, None] + target_masks = target_masks[:, None] + + with torch.no_grad(): + # sample point_coords + point_coords = get_uncertain_point_coords_with_randomness( + src_masks, + lambda logits: calculate_uncertainty(logits), + self.num_points, + self.oversample_ratio, + self.importance_sample_ratio, + ) + # get gt labels + point_labels = point_sample( + target_masks, + point_coords, + align_corners=False, + ).squeeze(1) + + point_logits = 
point_sample( + src_masks, + point_coords, + align_corners=False, + ).squeeze(1) + + losses = { + "loss_mask": sigmoid_ce_loss_jit( + point_logits, point_labels, num_masks, mask_type + ), + "loss_dice": dice_loss_jit( + point_logits, point_labels, num_masks, mask_type + ), + } + + del src_masks + del target_masks + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)] + ) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)] + ) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_masks, mask_type): + loss_map = {"labels": self.loss_labels, "masks": self.loss_masks} + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_masks, mask_type) + + def forward(self, outputs, targets, mask_type): + """This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. 
+ The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = { + k: v for k, v in outputs.items() if k != "aux_outputs" + } + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets, mask_type) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_masks = sum(len(t["labels"]) for t in targets) + num_masks = torch.as_tensor( + [num_masks], + dtype=torch.float, + device=next(iter(outputs.values())).device, + ) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_masks) + num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update( + self.get_loss( + loss, outputs, targets, indices, num_masks, mask_type + ) + ) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
+ if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets, mask_type) + for loss in self.losses: + l_dict = self.get_loss( + loss, + aux_outputs, + targets, + indices, + num_masks, + mask_type, + ) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + def __repr__(self): + head = "Criterion " + self.__class__.__name__ + body = [ + "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)), + "losses: {}".format(self.losses), + "weight_dict: {}".format(self.weight_dict), + "num_classes: {}".format(self.num_classes), + "eos_coef: {}".format(self.eos_coef), + "num_points: {}".format(self.num_points), + "oversample_ratio: {}".format(self.oversample_ratio), + "importance_sample_ratio: {}".format(self.importance_sample_ratio), + ] + _repr_indent = 4 + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/models/Mask3D/build/lib/mask3d/models/mask3d.py b/models/Mask3D/build/lib/mask3d/models/mask3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0e09440cfacc68a961af8231f8205bf1daf6a134 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/models/mask3d.py @@ -0,0 +1,870 @@ +import torch +import hydra +import torch.nn as nn +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine.MinkowskiPooling import MinkowskiAvgPooling +import numpy as np +from torch.nn import functional as F +from mask3d.models.modules.common import conv +from mask3d.models.position_embedding import PositionEmbeddingCoordsSine +from mask3d.models.modules.helpers_3detr import GenericMLP +from torch_scatter import scatter_mean, scatter_max, scatter_min +from torch.cuda.amp import autocast + +from pointnet2.pointnet2_utils import furthest_point_sample + + +class Mask3D(nn.Module): + def __init__( + self, + config, + hidden_dim, + num_queries, + num_heads, + dim_feedforward, + sample_sizes, + 
shared_decoder, + num_classes, + num_decoders, + dropout, + pre_norm, + positional_encoding_type, + non_parametric_queries, + train_on_segments, + normalize_pos_enc, + use_level_embed, + scatter_type, + hlevels, + use_np_features, + voxel_size, + max_sample_size, + random_queries, + gauss_scale, + random_query_both, + random_normal, + ): + super().__init__() + self.random_normal = random_normal + self.random_query_both = random_query_both + self.random_queries = random_queries + self.max_sample_size = max_sample_size + self.gauss_scale = gauss_scale + self.voxel_size = voxel_size + self.scatter_type = scatter_type + self.hlevels = hlevels + self.use_level_embed = use_level_embed + self.train_on_segments = train_on_segments + self.normalize_pos_enc = normalize_pos_enc + self.num_decoders = num_decoders + self.num_classes = num_classes + self.dropout = dropout + self.pre_norm = pre_norm + self.shared_decoder = shared_decoder + self.sample_sizes = sample_sizes + self.non_parametric_queries = non_parametric_queries + self.use_np_features = use_np_features + self.mask_dim = hidden_dim + self.num_heads = num_heads + self.num_queries = num_queries + self.pos_enc_type = positional_encoding_type + + self.backbone = hydra.utils.instantiate(config.backbone) + self.num_levels = len(self.hlevels) + sizes = self.backbone.PLANES[-5:] + + self.mask_features_head = conv( + self.backbone.PLANES[7], + self.mask_dim, + kernel_size=1, + stride=1, + bias=True, + D=3, + ) + + if self.scatter_type == "mean": + self.scatter_fn = scatter_mean + elif self.scatter_type == "max": + self.scatter_fn = lambda mask, p2s, dim: scatter_max( + mask, p2s, dim=dim + )[0] + else: + assert False, "Scatter function not known" + + assert ( + not use_np_features + ) or non_parametric_queries, "np features only with np queries" + + if self.non_parametric_queries: + self.query_projection = GenericMLP( + input_dim=self.mask_dim, + hidden_dims=[self.mask_dim], + output_dim=self.mask_dim, + use_conv=True, + 
output_use_activation=True, + hidden_use_bias=True, + ) + + if self.use_np_features: + self.np_feature_projection = nn.Sequential( + nn.Linear(sizes[-1], hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + ) + elif self.random_query_both: + self.query_projection = GenericMLP( + input_dim=2 * self.mask_dim, + hidden_dims=[2 * self.mask_dim], + output_dim=2 * self.mask_dim, + use_conv=True, + output_use_activation=True, + hidden_use_bias=True, + ) + else: + # PARAMETRIC QUERIES + # learnable query features + self.query_feat = nn.Embedding(num_queries, hidden_dim) + # learnable query p.e. + self.query_pos = nn.Embedding(num_queries, hidden_dim) + + if self.use_level_embed: + # learnable scale-level embedding + self.level_embed = nn.Embedding(self.num_levels, hidden_dim) + + self.mask_embed_head = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + ) + + self.class_embed_head = nn.Linear(hidden_dim, self.num_classes) + + if self.pos_enc_type == "legacy": + self.pos_enc = PositionalEncoding3D(channels=self.mask_dim) + elif self.pos_enc_type == "fourier": + self.pos_enc = PositionEmbeddingCoordsSine( + pos_type="fourier", + d_pos=self.mask_dim, + gauss_scale=self.gauss_scale, + normalize=self.normalize_pos_enc, + ) + elif self.pos_enc_type == "sine": + self.pos_enc = PositionEmbeddingCoordsSine( + pos_type="sine", + d_pos=self.mask_dim, + normalize=self.normalize_pos_enc, + ) + else: + assert False, "pos enc type not known" + + self.pooling = MinkowskiAvgPooling( + kernel_size=2, stride=2, dimension=3 + ) + + self.masked_transformer_decoder = nn.ModuleList() + self.cross_attention = nn.ModuleList() + self.self_attention = nn.ModuleList() + self.ffn_attention = nn.ModuleList() + self.lin_squeeze = nn.ModuleList() + + num_shared = self.num_decoders if not self.shared_decoder else 1 + + for _ in range(num_shared): + tmp_cross_attention = nn.ModuleList() + tmp_self_attention = nn.ModuleList() + 
tmp_ffn_attention = nn.ModuleList() + tmp_squeeze_attention = nn.ModuleList() + for i, hlevel in enumerate(self.hlevels): + tmp_cross_attention.append( + CrossAttentionLayer( + d_model=self.mask_dim, + nhead=self.num_heads, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + tmp_squeeze_attention.append( + nn.Linear(sizes[hlevel], self.mask_dim) + ) + + tmp_self_attention.append( + SelfAttentionLayer( + d_model=self.mask_dim, + nhead=self.num_heads, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + tmp_ffn_attention.append( + FFNLayer( + d_model=self.mask_dim, + dim_feedforward=dim_feedforward, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + self.cross_attention.append(tmp_cross_attention) + self.self_attention.append(tmp_self_attention) + self.ffn_attention.append(tmp_ffn_attention) + self.lin_squeeze.append(tmp_squeeze_attention) + + self.decoder_norm = nn.LayerNorm(hidden_dim) + + def get_pos_encs(self, coords): + pos_encodings_pcd = [] + + for i in range(len(coords)): + pos_encodings_pcd.append([[]]) + for coords_batch in coords[i].decomposed_features: + scene_min = coords_batch.min(dim=0)[0][None, ...] + scene_max = coords_batch.max(dim=0)[0][None, ...] 
+ + with autocast(enabled=False): + tmp = self.pos_enc( + coords_batch[None, ...].float(), + input_range=[scene_min, scene_max], + ) + + pos_encodings_pcd[-1][0].append(tmp.squeeze(0).permute((1, 0))) + + return pos_encodings_pcd + + def forward( + self, x, point2segment=None, raw_coordinates=None, is_eval=False + ): + # print(point2segment) + pcd_features, aux = self.backbone(x) + + batch_size = len(x.decomposed_coordinates) + + with torch.no_grad(): + coordinates = me.SparseTensor( + features=raw_coordinates, + coordinate_manager=aux[-1].coordinate_manager, + coordinate_map_key=aux[-1].coordinate_map_key, + device=aux[-1].device, + ) + + coords = [coordinates] + for _ in reversed(range(len(aux) - 1)): + coords.append(self.pooling(coords[-1])) + + coords.reverse() + + pos_encodings_pcd = self.get_pos_encs(coords) + mask_features = self.mask_features_head(pcd_features) + if point2segment is not None: + mask_segments = [] + for i, mask_feature in enumerate( + mask_features.decomposed_features + ): + mask_segments.append( + self.scatter_fn(mask_feature, point2segment[i], dim=0) + ) + + sampled_coords = None + + if self.non_parametric_queries: + fps_idx = [ + furthest_point_sample( + x.decomposed_coordinates[i][None, ...].float(), + self.num_queries, + ) + .squeeze(0) + .long() + for i in range(len(x.decomposed_coordinates)) + ] + + sampled_coords = torch.stack( + [ + coordinates.decomposed_features[i][fps_idx[i].long(), :] + for i in range(len(fps_idx)) + ] + ) + + mins = torch.stack( + [ + coordinates.decomposed_features[i].min(dim=0)[0] + for i in range(len(coordinates.decomposed_features)) + ] + ) + maxs = torch.stack( + [ + coordinates.decomposed_features[i].max(dim=0)[0] + for i in range(len(coordinates.decomposed_features)) + ] + ) + + query_pos = self.pos_enc( + sampled_coords.float(), input_range=[mins, maxs] + ) # Batch, Dim, queries + query_pos = self.query_projection(query_pos) + + if not self.use_np_features: + queries = 
torch.zeros_like(query_pos).permute((0, 2, 1)) + else: + queries = torch.stack( + [ + pcd_features.decomposed_features[i][ + fps_idx[i].long(), : + ] + for i in range(len(fps_idx)) + ] + ) + queries = self.np_feature_projection(queries) + query_pos = query_pos.permute((2, 0, 1)) + elif self.random_queries: + query_pos = ( + torch.rand( + batch_size, + self.mask_dim, + self.num_queries, + device=x.device, + ) + - 0.5 + ) + + queries = torch.zeros_like(query_pos).permute((0, 2, 1)) + query_pos = query_pos.permute((2, 0, 1)) + elif self.random_query_both: + if not self.random_normal: + query_pos_feat = ( + torch.rand( + batch_size, + 2 * self.mask_dim, + self.num_queries, + device=x.device, + ) + - 0.5 + ) + else: + query_pos_feat = torch.randn( + batch_size, + 2 * self.mask_dim, + self.num_queries, + device=x.device, + ) + + queries = query_pos_feat[:, : self.mask_dim, :].permute((0, 2, 1)) + query_pos = query_pos_feat[:, self.mask_dim :, :].permute( + (2, 0, 1) + ) + else: + # PARAMETRIC QUERIES + queries = self.query_feat.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + query_pos = self.query_pos.weight.unsqueeze(1).repeat( + 1, batch_size, 1 + ) + + predictions_class = [] + predictions_mask = [] + + for decoder_counter in range(self.num_decoders): + if self.shared_decoder: + decoder_counter = 0 + for i, hlevel in enumerate(self.hlevels): + if point2segment is not None: + output_class, outputs_mask, attn_mask = self.mask_module( + queries, + mask_features, + mask_segments, + len(aux) - hlevel - 1, + ret_attn_mask=True, + point2segment=point2segment, + coords=coords, + ) + else: + output_class, outputs_mask, attn_mask = self.mask_module( + queries, + mask_features, + None, + len(aux) - hlevel - 1, + ret_attn_mask=True, + point2segment=None, + coords=coords, + ) + + decomposed_aux = aux[hlevel].decomposed_features + decomposed_attn = attn_mask.decomposed_features + + curr_sample_size = max( + [pcd.shape[0] for pcd in decomposed_aux] + ) + + if min([pcd.shape[0] 
for pcd in decomposed_aux]) == 1: + raise RuntimeError( + "only a single point gives nans in cross-attention" + ) + + if not (self.max_sample_size or is_eval): + curr_sample_size = min( + curr_sample_size, self.sample_sizes[hlevel] + ) + + rand_idx = [] + mask_idx = [] + for k in range(len(decomposed_aux)): + pcd_size = decomposed_aux[k].shape[0] + if pcd_size <= curr_sample_size: + # we do not need to sample + # take all points and pad the rest with zeroes and mask it + idx = torch.zeros( + curr_sample_size, + dtype=torch.long, + device=queries.device, + ) + + midx = torch.ones( + curr_sample_size, + dtype=torch.bool, + device=queries.device, + ) + + idx[:pcd_size] = torch.arange( + pcd_size, device=queries.device + ) + + midx[:pcd_size] = False # attend to first points + else: + # we have more points in pcd as we like to sample + # take a subset (no padding or masking needed) + idx = torch.randperm( + decomposed_aux[k].shape[0], device=queries.device + )[:curr_sample_size] + midx = torch.zeros( + curr_sample_size, + dtype=torch.bool, + device=queries.device, + ) # attend to all + + rand_idx.append(idx) + mask_idx.append(midx) + + batched_aux = torch.stack( + [ + decomposed_aux[k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_attn = torch.stack( + [ + decomposed_attn[k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_pos_enc = torch.stack( + [ + pos_encodings_pcd[hlevel][0][k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_attn.permute((0, 2, 1))[ + batched_attn.sum(1) == rand_idx[0].shape[0] + ] = False + + m = torch.stack(mask_idx) + batched_attn = torch.logical_or(batched_attn, m[..., None]) + + src_pcd = self.lin_squeeze[decoder_counter][i]( + batched_aux.permute((1, 0, 2)) + ) + if self.use_level_embed: + src_pcd += self.level_embed.weight[i] + + output = self.cross_attention[decoder_counter][i]( + queries.permute((1, 0, 2)), + src_pcd, + memory_mask=batched_attn.repeat_interleave( + 
self.num_heads, dim=0 + ).permute((0, 2, 1)), + memory_key_padding_mask=None, # here we do not apply masking on padded region + pos=batched_pos_enc.permute((1, 0, 2)), + query_pos=query_pos, + ) + + output = self.self_attention[decoder_counter][i]( + output, + tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_pos, + ) + + # FFN + queries = self.ffn_attention[decoder_counter][i]( + output + ).permute((1, 0, 2)) + + predictions_class.append(output_class) + predictions_mask.append(outputs_mask) + + if point2segment is not None: + output_class, outputs_mask = self.mask_module( + queries, + mask_features, + mask_segments, + 0, + ret_attn_mask=False, + point2segment=point2segment, + coords=coords, + ) + else: + output_class, outputs_mask = self.mask_module( + queries, + mask_features, + None, + 0, + ret_attn_mask=False, + point2segment=None, + coords=coords, + ) + predictions_class.append(output_class) + predictions_mask.append(outputs_mask) + + return { + "pred_logits": predictions_class[-1], + "pred_masks": predictions_mask[-1], + "aux_outputs": self._set_aux_loss( + predictions_class, predictions_mask + ), + "sampled_coords": sampled_coords.detach().cpu().numpy() + if sampled_coords is not None + else None, + "backbone_features": pcd_features, + } + + def mask_module( + self, + query_feat, + mask_features, + mask_segments, + num_pooling_steps, + ret_attn_mask=True, + point2segment=None, + coords=None, + ): + query_feat = self.decoder_norm(query_feat) + mask_embed = self.mask_embed_head(query_feat) + outputs_class = self.class_embed_head(query_feat) + + output_masks = [] + + if point2segment is not None: + output_segments = [] + for i in range(len(mask_segments)): + output_segments.append(mask_segments[i] @ mask_embed[i].T) + output_masks.append(output_segments[-1][point2segment[i]]) + else: + for i in range(mask_features.C[-1, 0] + 1): + output_masks.append( + mask_features.decomposed_features[i] @ mask_embed[i].T + ) + + output_masks = 
class PositionalEncoding3D(nn.Module):
    """Sinusoidal positional encoding for 3-D point coordinates."""

    def __init__(self, channels):
        """
        :param channels: Number of encoding channels to return per point.
        """
        # Initialize the Module machinery before assigning any attribute.
        super().__init__()
        self.orig_ch = channels
        # Channels per axis: 2 * ceil(channels / 6) is always even, so the
        # sin and cos halves have equal size and the three axes together
        # provide at least ``channels`` values.
        channels = int(np.ceil(channels / 6) * 2)
        self.channels = channels
        inv_freq = 1.0 / (
            10000 ** (torch.arange(0, channels, 2).float() / channels)
        )
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, tensor, input_range=None):
        """
        :param tensor: A 3d tensor of size (batch_size, num_points, 3+) whose
            last dimension starts with the x, y, z coordinates
        :param input_range: Ignored; accepted so this module can be called
            with the same keyword as the other position encoders
        :return: Positional Encoding Matrix of size
            (batch_size, channels, num_points)
        """
        per_axis = []
        for axis in range(3):
            # Outer product of the coordinate with every inverse frequency.
            angles = torch.einsum(
                "bi,j->bij", tensor[:, :, axis], self.inv_freq
            )
            per_axis.append(torch.cat((angles.sin(), angles.cos()), dim=-1))

        emb = torch.cat(per_axis, dim=-1)
        # Trim the surplus channels, then move channels before points.
        return emb[:, :, : self.orig_ch].permute((0, 2, 1))
self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre( + self, tgt, tgt_mask=None, tgt_key_padding_mask=None, query_pos=None + ): + tgt2 = self.norm(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + )[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward( + self, tgt, tgt_mask=None, tgt_key_padding_mask=None, query_pos=None + ): + if self.normalize_before: + return self.forward_pre( + tgt, tgt_mask, tgt_key_padding_mask, query_pos + ) + return self.forward_post( + tgt, tgt_mask, tgt_key_padding_mask, query_pos + ) + + +class CrossAttentionLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dropout=0.0, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout + ) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + memory_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre( + self, + tgt, + memory, + memory_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + tgt2 = self.norm(tgt) + + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + 
def _get_activation_fn(activation):
    """Return an activation function given a string.

    Accepted names are "relu", "gelu" and "glu". Any other value raises a
    RuntimeError whose message lists all three supported names.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    # The message previously omitted "glu" even though it is accepted above.
    raise RuntimeError(
        f"activation should be relu/gelu/glu, not {activation}."
    )
@torch.no_grad()
def memory_efficient_forward(self, outputs, targets, mask_type):
    """More memory-friendly matching.

    Builds one cost matrix per batch element as the weighted sum of the
    class cost, the sigmoid cross-entropy mask cost and the dice mask cost,
    then solves the assignment with ``linear_sum_assignment``.

    Params:
        outputs: dict read for "pred_logits" (its first two dimensions are
            taken as batch size and number of queries) and "pred_masks".
        targets: sequence indexed by batch element; each entry is read for
            "labels" and for the key given by ``mask_type``.
        mask_type: key selecting which target mask tensor is matched.

    Returns:
        A list with one ``(index_i, index_j)`` tuple of int64 tensors per
        batch element, converted from the ``linear_sum_assignment`` result.
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    indices = []

    # Iterate through batch size
    for b in range(bs):

        out_prob = outputs["pred_logits"][b].softmax(
            -1
        )  # [num_queries, num_classes]
        # Cloned because the ignore ids are overwritten in place below.
        tgt_ids = targets[b]["labels"].clone()

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be omitted.
        # Label id 253 is treated as "ignore": it is remapped to class 0 so
        # the gather below stays in range, then its cost column is overwritten.
        filter_ignore = tgt_ids == 253
        tgt_ids[filter_ignore] = 0
        cost_class = -out_prob[:, tgt_ids]
        cost_class[
            :, filter_ignore
        ] = (
            -1.0
        )  # for ignore classes pretend perfect match ;) TODO better worst class match?

        # Transposed so that queries come first; presumably the result is
        # [num_queries, num_points] — TODO confirm against the model output.
        out_mask = outputs["pred_masks"][
            b
        ].T
        # gt masks are already padded when preparing target
        tgt_mask = targets[b][mask_type].to(out_mask)

        # NOTE(review): num_points is multiplied by the size of dimension 1
        # of tgt_mask, so it acts as a fraction of points to keep rather
        # than an absolute count; -1 disables subsampling. A value of 0
        # keeps no points at all — confirm callers always configure it.
        if self.num_points != -1:
            point_idx = torch.randperm(
                tgt_mask.shape[1], device=tgt_mask.device
            )[: int(self.num_points * tgt_mask.shape[1])]
        else:
            # sample all points
            point_idx = torch.arange(
                tgt_mask.shape[1], device=tgt_mask.device
            )

        # All masks share the same set of points (point_idx) for efficient
        # matching; an earlier point_sample-based variant is no longer used.

        # Mask costs are computed in full precision with autocast disabled.
        with autocast(enabled=False):
            out_mask = out_mask.float()
            tgt_mask = tgt_mask.float()
            # Compute the sigmoid cross-entropy cost between masks
            cost_mask = batch_sigmoid_ce_loss_jit(
                out_mask[:, point_idx], tgt_mask[:, point_idx]
            )

            # Compute the dice loss between masks
            cost_dice = batch_dice_loss_jit(
                out_mask[:, point_idx], tgt_mask[:, point_idx]
            )

        # Final cost matrix
        C = (
            self.cost_mask * cost_mask
            + self.cost_class * cost_class
            + self.cost_dice * cost_dice
        )
        # linear_sum_assignment runs on the host, so move the matrix to CPU.
        C = C.reshape(num_queries, -1).cpu()

        indices.append(linear_sum_assignment(C))

    return [
        (
            torch.as_tensor(i, dtype=torch.int64),
            torch.as_tensor(j, dtype=torch.int64),
        )
        for i, j in indices
    ]
class ConfusionMatrix:
    """Accumulates a confusion matrix for a multi-class classification problem.

    Does not support multi-label, multi-class problems.

    Keyword arguments:
    - num_classes (int): number of classes in the classification problem.
    - ignore_label (int or array-like of ints): target value(s) whose samples
      are dropped before counting (membership is tested with ``np.isin``).

    Modified from: https://github.com/pytorch/tnt/blob/master/torchnet/meter/confusionmeter.py
    """

    def __init__(self, num_classes, ignore_label):
        super().__init__()

        # Rows index the ground-truth class, columns the predicted class.
        self.conf = np.zeros((num_classes, num_classes), dtype=np.int32)
        self.ignore_label = ignore_label
        self.num_classes = num_classes

    def reset(self):
        """Zero the accumulated counts in place."""
        self.conf.fill(0)

    def add(self, predicted, target):
        """Accumulates one batch of predictions into the confusion matrix.

        The shape of the confusion matrix is K x K, where K is the number
        of classes. Samples whose target is an ignored label are dropped
        first; if nothing remains the call is a no-op.

        Keyword arguments:
        - predicted (Tensor or numpy.ndarray): Can be an N x K tensor/array of
        predicted scores obtained from the model for N examples and K classes,
        or an N-tensor/array of integer values between 0 and K-1.
        - target (Tensor or numpy.ndarray): Can be an N x K tensor/array of
        ground-truth classes for N examples and K classes, or an N-tensor/array
        of integer values between 0 and K-1.

        """
        # If target and/or predicted are tensors, convert them to numpy arrays
        if torch.is_tensor(predicted):
            predicted = predicted.cpu().numpy()
        if torch.is_tensor(target):
            target = target.cpu().numpy()

        # Drop every sample whose target is an ignored label.
        keep = ~np.isin(target, self.ignore_label)
        predicted, target = predicted[keep], target[keep]

        assert (
            predicted.shape[0] == target.shape[0]
        ), "number of targets and predicted outputs do not match"

        if target.shape[0] == 0:
            # Everything was ignored (or the batch was empty). The range
            # checks below call .max()/.min(), which raise on empty arrays,
            # and there is nothing to count anyway.
            return

        if np.ndim(predicted) != 1:
            assert (
                predicted.shape[1] == self.num_classes
            ), "number of predictions does not match size of confusion matrix"
            predicted = np.argmax(predicted, 1)
        else:
            assert (predicted.max() < self.num_classes) and (
                predicted.min() >= 0
            ), "predicted values are not between 0 and k-1"

        if np.ndim(target) != 1:
            assert (
                target.shape[1] == self.num_classes
            ), "Onehot target does not match size of confusion matrix"
            assert (target >= 0).all() and (
                target <= 1
            ).all(), "in one-hot encoding, target values should be 0 or 1"
            assert (
                target.sum(1) == 1
            ).all(), "multi-label setting is not supported"
            target = np.argmax(target, 1)
        else:
            assert (target.max() < self.num_classes) and (
                target.min() >= 0
            ), "target values are not between 0 and k-1"

        # hack for bincounting 2 arrays together: encode each (target,
        # predicted) pair as one flat index into the K*K matrix.
        flat_index = predicted + self.num_classes * target
        bincount_2d = np.bincount(
            flat_index.astype(np.int32), minlength=self.num_classes**2
        )
        assert bincount_2d.size == self.num_classes**2
        conf = bincount_2d.reshape((self.num_classes, self.num_classes))

        self.conf += conf

    def value(self, normalized=False):
        """
        Keyword arguments:
        - normalized (boolean, optional): when True, return a float32 copy
          in which every row is divided by its sum (clipped to 1e-12 so
          empty rows stay zero). Default: False.

        Returns:
            Confusion matrix of K rows and K columns, where rows corresponds
            to ground-truth targets and columns corresponds to predicted
            targets.
        """
        if normalized:
            conf = self.conf.astype(np.float32)
            return conf / conf.sum(1).clip(min=1e-12)[:, None]
        return self.conf
def value(self, conf_matrix):
    """Computes the per-class IoU from a confusion matrix.

    A class with no true positives, false positives or false negatives
    produces 0/0; the resulting NaN is kept in the output and the numpy
    warning is suppressed.

    Args:
        conf_matrix: K x K confusion matrix. Row sums are used for the
            false negatives and column sums for the false positives, i.e.
            rows are ground truth and columns are predictions.

    Returns:
        numpy.ndarray with K elements holding the per-class IoU. The mean
        IoU is not computed here; callers that need it can reduce the
        result themselves (e.g. with ``np.nanmean``).
    """
    true_positive = np.diag(conf_matrix)
    false_positive = np.sum(conf_matrix, 0) - true_positive
    false_negative = np.sum(conf_matrix, 1) - true_positive
    union = true_positive + false_positive + false_negative

    # Just in case we get a division by 0, ignore/hide the error
    with np.errstate(divide="ignore", invalid="ignore"):
        iou = true_positive / union

    return iou
def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    """Return the element-wise maximum over a list of equally long lists.

    The result is a new list; ``the_list`` and its sub-lists are left
    untouched. Raises IndexError when ``the_list`` is empty.
    """
    # Copy the first row: aliasing it would overwrite the caller's data
    # while the running maxima are updated below.
    maxes = list(the_list[0])
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is available and initialized."""
    # Short-circuits: is_initialized() is only queried when the distributed
    # package is available.
    return dist.is_available() and dist.is_initialized()
class BatchNormDim1Swap(nn.BatchNorm1d):
    """
    BatchNorm1d for nn.Transformer-style inputs laid out as HW x N x C
    """

    def forward(self, x):
        """
        x: HW x N x C

        The input is permuted to N x C x HW so that BatchNorm1d normalizes
        over C, and the result is permuted back to HW x N x C.
        """
        # Unpacking doubles as a rank check: anything but a 3-D input fails.
        _hw, _n, _c = x.shape
        channels_second = x.permute(1, 2, 0)
        normed = nn.BatchNorm1d.forward(self, channels_second)
        # n x c x hw -> hw x n x c
        return normed.permute(2, 0, 1)
def get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of module."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
class ConvType(Enum):
    """
    Define the kernel region type

    Each member is declared as ``(value, fullname)``: ``value`` is the
    integer used as the enum value (and returned by ``int(member)``), and
    ``fullname`` is stored as an extra attribute on the member.
    """

    HYPERCUBE = 0, "HYPERCUBE"
    SPATIAL_HYPERCUBE = 1, "SPATIAL_HYPERCUBE"
    SPATIO_TEMPORAL_HYPERCUBE = 2, "SPATIO_TEMPORAL_HYPERCUBE"
    HYPERCROSS = 3, "HYPERCROSS"
    SPATIAL_HYPERCROSS = 4, "SPATIAL_HYPERCROSS"
    SPATIO_TEMPORAL_HYPERCROSS = 5, "SPATIO_TEMPORAL_HYPERCROSS"
    # The fullname used to carry a stray trailing space, unlike every
    # other member.
    SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS = (
        6,
        "SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS",
    )

    def __new__(cls, value, name):
        member = object.__new__(cls)
        # Only the integer becomes the enum value, so ConvType(3) lookups work.
        member._value_ = value
        member.fullname = name
        return member

    def __int__(self):
        return self.value
def convert_conv_type(conv_type, kernel_size, D):
    """Translate a ConvType into the arguments used to build a kernel.

    Args:
        conv_type: a ConvType member (anything else fails the assertion).
        kernel_size: a scalar or a sequence. For the SPATIAL_* types it is
            reduced/expanded to three spatial entries, plus a trailing 1
            when ``D == 4``; otherwise it is returned unchanged.
        D: number of dimensions. The SPATIO_TEMPORAL_* types assert
            ``D == 4``.

    Returns:
        ``(region_type, axis_types, kernel_size)``. ``axis_types`` is None
        except for SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS.
    """
    assert isinstance(conv_type, ConvType), "conv_type must be of ConvType"
    region_type = conv_to_region_type[conv_type]
    axis_types = None

    if conv_type in (ConvType.SPATIAL_HYPERCUBE, ConvType.SPATIAL_HYPERCROSS):
        # No temporal convolution
        if isinstance(kernel_size, Sequence):
            kernel_size = kernel_size[:3]
        else:
            kernel_size = [kernel_size] * 3
        if D == 4:
            # Build a new list rather than calling .append(): the slice of
            # a tuple is still a tuple, which has no append().
            kernel_size = [*kernel_size, 1]
    elif conv_type in (
        ConvType.SPATIO_TEMPORAL_HYPERCUBE,
        ConvType.SPATIO_TEMPORAL_HYPERCROSS,
    ):
        # conv_type conversion already handled
        assert D == 4
    elif conv_type == ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS:
        # Define the CUBIC conv kernel for spatial dims and CROSS conv for temp dim
        axis_types = [ME.RegionType.HYPER_CUBE] * 3
        if D == 4:
            axis_types.append(ME.RegionType.HYPER_CROSS)
    # ConvType.HYPERCUBE / ConvType.HYPERCROSS need no further handling.

    return region_type, axis_types, kernel_size
def conv_tr(
    in_planes,
    out_planes,
    kernel_size,
    upsample_stride=1,
    dilation=1,
    bias=False,
    conv_type=ConvType.HYPERCUBE,
    D=-1,
):
    """Build an ME.MinkowskiConvolutionTranspose for the given ConvType.

    ``kernel_size`` is first resolved through ``convert_conv_type``; the
    resolved value, region type and axis types parameterize the
    ME.KernelGenerator that is handed to the layer. ``D`` must be positive.
    """
    assert D > 0, "Dimension must be a positive integer"

    region, axes, resolved_kernel = convert_conv_type(
        conv_type, kernel_size, D
    )
    generator = ME.KernelGenerator(
        resolved_kernel,
        upsample_stride,
        dilation,
        region_type=region,
        axis_types=axes,
        dimension=D,
    )

    layer_kwargs = dict(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=resolved_kernel,
        stride=upsample_stride,
        dilation=dilation,
        bias=bias,
        kernel_generator=generator,
        dimension=D,
    )
    return ME.MinkowskiConvolutionTranspose(**layer_kwargs)
def sum_pool(
    kernel_size, stride=1, dilation=1, conv_type=ConvType.HYPERCUBE, D=-1
):
    """Build an ME.MinkowskiSumPooling layer for the given ConvType.

    ``kernel_size`` is resolved through ``convert_conv_type`` and the
    result is used both for the ME.KernelGenerator and for the pooling
    layer itself. ``D`` must be positive.
    """
    assert D > 0, "Dimension must be a positive integer"

    region, axes, resolved_kernel = convert_conv_type(
        conv_type, kernel_size, D
    )
    generator = ME.KernelGenerator(
        resolved_kernel,
        stride,
        dilation,
        region_type=region,
        axis_types=axes,
        dimension=D,
    )

    pool_kwargs = dict(
        kernel_size=resolved_kernel,
        stride=stride,
        dilation=dilation,
        kernel_generator=generator,
        dimension=D,
    )
    return ME.MinkowskiSumPooling(**pool_kwargs)
class GenericMLP(nn.Module):
    """Configurable stack of Linear (or 1x1 Conv1d) layers.

    Each hidden layer is ``Linear/Conv1d -> [norm] -> activation ->
    [dropout]``; the output layer is ``Linear/Conv1d -> [norm] ->
    [activation]``.

    Args:
        input_dim: size of the input features.
        hidden_dims: sizes of the hidden layers, in order.
        output_dim: size of the output features.
        norm_fn_name: key into NORM_DICT, or None for no normalization.
            With ``"ln"`` and ``use_conv=True`` a single-group GroupNorm
            is used instead of LayerNorm.
        activation: key into ACTIVATION_DICT.
        use_conv: use ``nn.Conv1d`` with kernel size 1 instead of
            ``nn.Linear``.
        dropout: None, a single probability applied after every hidden
            layer, or a list/tuple with one probability per hidden layer.
        hidden_use_bias: whether hidden layers have a bias term.
        output_use_bias: whether the output layer has a bias term.
        output_use_activation: append the activation after the output layer.
        output_use_norm: append the norm after the output layer; requires
            ``norm_fn_name``.
        weight_init_name: key into WEIGHT_INIT_DICT, or None to keep the
            default initialization.

    Raises:
        ValueError: if ``output_use_norm`` is set without ``norm_fn_name``,
            or if a dropout list is shorter than ``hidden_dims``.
    """

    def __init__(
        self,
        input_dim,
        hidden_dims,
        output_dim,
        norm_fn_name=None,
        activation="relu",
        use_conv=False,
        dropout=None,
        hidden_use_bias=False,
        output_use_bias=True,
        output_use_activation=False,
        output_use_norm=False,
        weight_init_name=None,
    ):
        super().__init__()
        activation = ACTIVATION_DICT[activation]
        norm = None
        if norm_fn_name is not None:
            norm = NORM_DICT[norm_fn_name]
        if norm_fn_name == "ln" and use_conv:
            norm = lambda x: nn.GroupNorm(1, x)  # easier way to use LayerNorm

        if output_use_norm and norm is None:
            # Fail early with a clear message instead of calling None below.
            raise ValueError("output_use_norm=True requires norm_fn_name")

        if dropout is not None:
            if isinstance(dropout, (list, tuple)):
                if len(dropout) < len(hidden_dims):
                    raise ValueError(
                        "dropout needs one entry per hidden layer"
                    )
            else:
                # A single probability is shared by every hidden layer.
                dropout = [dropout] * len(hidden_dims)

        def make_layer(in_dim, out_dim, bias):
            # Conv1d with kernel size 1 is the per-position analogue of Linear.
            if use_conv:
                return nn.Conv1d(in_dim, out_dim, 1, bias=bias)
            return nn.Linear(in_dim, out_dim, bias=bias)

        layers = []
        prev_dim = input_dim
        for idx, x in enumerate(hidden_dims):
            layers.append(make_layer(prev_dim, x, hidden_use_bias))
            if norm:
                layers.append(norm(x))
            layers.append(activation())
            if dropout is not None:
                layers.append(nn.Dropout(p=dropout[idx]))
            prev_dim = x
        layers.append(make_layer(prev_dim, output_dim, output_use_bias))

        if output_use_norm:
            layers.append(norm(output_dim))

        if output_use_activation:
            layers.append(activation())

        self.layers = nn.Sequential(*layers)

        if weight_init_name is not None:
            self.do_weight_init(weight_init_name)

    def do_weight_init(self, weight_init_name):
        """Apply the named initializer to every parameter with more than one dim."""
        func = WEIGHT_INIT_DICT[weight_init_name]
        for (_, param) in self.named_parameters():
            if param.dim() > 1:  # skips batchnorm/layernorm
                func(param)

    def forward(self, x):
        """Run x through the sequential layer stack."""
        output = self.layers(x)
        return output
class BottleneckBase(nn.Module):
    """Residual bottleneck: 1x1 conv, 3x3 conv, 1x1 conv, then skip addition.

    The last convolution widens the features to ``planes * expansion``
    channels. Subclasses select the normalization layer through ``NORM_TYPE``.
    """

    expansion = 4
    NORM_TYPE = NormType.BATCH_NORM

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        dilation=1,
        downsample=None,
        conv_type=ConvType.HYPERCUBE,
        bn_momentum=0.1,
        D=3,
    ):
        """Create the three conv/norm pairs, the shared ReLU and the skip path.

        ``stride``, ``dilation`` and ``conv_type`` apply only to the middle
        (kernel size 3) convolution. ``downsample`` is an optional module
        applied to the block input before it is added back.
        """
        super().__init__()
        widened = planes * self.expansion

        def make_norm(channels):
            return get_norm(
                self.NORM_TYPE, channels, D, bn_momentum=bn_momentum
            )

        # Modules are created in the same order as before so parameter
        # registration (and therefore state_dict layout) is unchanged.
        self.conv1 = conv(inplanes, planes, kernel_size=1, D=D)
        self.norm1 = make_norm(planes)

        self.conv2 = conv(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            dilation=dilation,
            conv_type=conv_type,
            D=D,
        )
        self.norm2 = make_norm(planes)

        self.conv3 = conv(planes, widened, kernel_size=1, D=D)
        self.norm3 = make_norm(widened)

        self.relu = MinkowskiReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(branch(x) + skip(x))``."""
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.relu(self.norm2(self.conv2(out)))
        out = self.norm3(self.conv3(out))

        # Identity skip unless a projection module was supplied.
        residual = x if self.downsample is None else self.downsample(x)

        out += residual
        return self.relu(out)
class SEBasicBlock(BasicBlock):
    """Basic residual block with an ``SELayer`` gate applied before the skip add.

    The convolution/normalization layers come from the parent block; this
    class adds ``self.se`` and inserts it after the second normalization.

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels of both convolutions.
        stride: Stride of the first convolution.
        dilation: Dilation of both convolutions.
        downsample: Optional module applied to the input for the skip path.
        conv_type: ``ConvType`` member forwarded to the parent block.
        reduction: Channel reduction factor of the ``SELayer`` bottleneck.
        D: Number of spatial dimensions. Defaults to 3, matching the parent
            block and ``SEBottleneck``; the previous default of -1 was
            forwarded unchanged to the parent constructor.
            NOTE(review): a non-positive dimension is presumably rejected by
            the conv helpers (``sum_pool`` asserts ``D > 0``) - confirm.
        bn_momentum: Momentum forwarded to the parent's normalization layers.
    """

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        dilation=1,
        downsample=None,
        conv_type=ConvType.HYPERCUBE,
        reduction=16,
        D=3,
        bn_momentum=0.1,
    ):
        super().__init__(
            inplanes,
            planes,
            stride=stride,
            dilation=dilation,
            downsample=downsample,
            conv_type=conv_type,
            bn_momentum=bn_momentum,
            D=D,
        )
        self.se = SELayer(planes, reduction=reduction, D=D)

    def forward(self, x):
        """Return ``relu(se(branch(x)) + skip(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.norm1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.norm2(out)
        # Channel re-weighting happens before the residual is added.
        out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
SEBottleneck(Bottleneck): + def __init__( + self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + conv_type=ConvType.HYPERCUBE, + D=3, + reduction=16, + ): + super().__init__( + inplanes, + planes, + stride=stride, + dilation=dilation, + downsample=downsample, + conv_type=conv_type, + D=D, + ) + self.se = SELayer(planes * self.expansion, reduction=reduction, D=D) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class SEBottleneckSN(SEBottleneck): + NORM_TYPE = NormType.SPARSE_SWITCH_NORM + + +class SEBottleneckIN(SEBottleneck): + NORM_TYPE = NormType.SPARSE_INSTANCE_NORM + + +class SEBottleneckLN(SEBottleneck): + NORM_TYPE = NormType.SPARSE_LAYER_NORM diff --git a/models/Mask3D/build/lib/mask3d/models/position_embedding.py b/models/Mask3D/build/lib/mask3d/models/position_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..70275f1610e1d3f5ec8d11d18d298b7877204b86 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/models/position_embedding.py @@ -0,0 +1,179 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Various positional encodings for the transformer. 
class PositionEmbeddingCoordsSine(nn.Module):
    """Positional encoding of point coordinates ("sine" or "fourier" flavour).

    ``forward`` takes a ``batch x npoints x d_in`` tensor and returns a
    ``batch x channels x npoints`` embedding computed under ``torch.no_grad``.

    Args:
        temperature: Base of the frequency progression of the sine encoding.
        normalize: If True, coordinates are first rescaled with
            ``shift_scale_points`` using the ``input_range`` given to
            ``forward``.
        scale: Multiplier applied to coordinates in the sine encoding;
            defaults to ``2 * pi``. May only be passed with ``normalize=True``.
        pos_type: ``"sine"`` or ``"fourier"``.
        d_pos: Number of output channels. Required (and even) for "fourier".
            For "sine" it overrides the per-call ``num_channels`` when set.
        d_in: Coordinate dimensionality used to size the Gaussian projection.
        gauss_scale: Standard deviation multiplier of the Gaussian projection.

    Raises:
        ValueError: If ``scale`` is passed while ``normalize`` is False.
    """

    def __init__(
        self,
        temperature=10000,
        normalize=False,
        scale=None,
        pos_type="fourier",
        d_pos=None,
        d_in=3,
        gauss_scale=1.0,
    ):
        super().__init__()
        self.d_pos = d_pos
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        assert pos_type in ["sine", "fourier"]
        self.pos_type = pos_type
        self.scale = scale
        if pos_type == "fourier":
            assert d_pos is not None
            assert d_pos % 2 == 0
            # define a gaussian matrix input_ch -> output_ch
            B = torch.empty((d_in, d_pos // 2)).normal_()
            B *= gauss_scale
            self.register_buffer("gauss_B", B)

    def get_sine_embeddings(self, xyz, num_channels, input_range):
        """Return interleaved sin/cos embeddings, ``batch x channels x npoints``.

        ``self.d_pos`` takes precedence over ``num_channels`` when it is set
        (the behaviour callers already rely on); ``num_channels`` is used only
        when the module was built without ``d_pos``.

        Raises:
            ValueError: If neither ``d_pos`` nor ``num_channels`` is available.
        """
        if self.d_pos is not None:
            num_channels = self.d_pos
        if num_channels is None:
            raise ValueError(
                "num_channels must be given when d_pos was not set"
            )

        if self.normalize:
            xyz = shift_scale_points(xyz, src_range=input_range)

        n_axes = xyz.shape[2]
        ndim = num_channels // n_axes
        if ndim % 2 != 0:
            ndim -= 1
        # automatically handle remainder by assigning it to the first dims
        rems = num_channels - (ndim * n_axes)

        final_embeds = []
        prev_dim = 0

        for d in range(n_axes):
            cdim = ndim
            if rems > 0:
                # add remainder in increments of two to maintain even size
                cdim += 2
                rems -= 2

            # Frequencies only need recomputing when the channel count changes.
            if cdim != prev_dim:
                dim_t = torch.arange(
                    cdim, dtype=torch.float32, device=xyz.device
                )
                dim_t = self.temperature ** (2 * (dim_t // 2) / cdim)

            # Out-of-place scaling: the caller's tensor is never modified.
            raw_pos = xyz[:, :, d]
            if self.scale:
                raw_pos = raw_pos * self.scale
            pos = raw_pos[:, :, None] / dim_t
            pos = torch.stack(
                (pos[:, :, 0::2].sin(), pos[:, :, 1::2].cos()), dim=3
            ).flatten(2)
            final_embeds.append(pos)
            prev_dim = cdim

        final_embeds = torch.cat(final_embeds, dim=2).permute(0, 2, 1)
        return final_embeds

    def get_fourier_embeddings(self, xyz, num_channels=None, input_range=None):
        """Return ``[sin(xB), cos(xB)]`` embeddings, ``batch x channels x npoints``.

        ``num_channels`` defaults to twice the number of columns of the
        Gaussian matrix and may select fewer (but never more) columns.
        """
        # Follows - https://people.eecs.berkeley.edu/~bmild/fourfeat/index.html

        if num_channels is None:
            num_channels = self.gauss_B.shape[1] * 2

        bsize, npoints = xyz.shape[0], xyz.shape[1]
        assert num_channels > 0 and num_channels % 2 == 0
        d_in, max_d_out = self.gauss_B.shape[0], self.gauss_B.shape[1]
        d_out = num_channels // 2
        assert d_out <= max_d_out
        assert d_in == xyz.shape[-1]

        if self.normalize:
            xyz = shift_scale_points(xyz, src_range=input_range)

        # Out-of-place multiply keeps the caller's tensor intact; reshape
        # (unlike view) also accepts non-contiguous inputs such as permuted
        # coordinate tensors.
        xyz = xyz * (2 * np.pi)
        xyz_proj = torch.mm(
            xyz.reshape(-1, d_in), self.gauss_B[:, :d_out]
        ).reshape(bsize, npoints, d_out)
        final_embeds = [xyz_proj.sin(), xyz_proj.cos()]

        # return batch x d_pos x npoints embedding
        final_embeds = torch.cat(final_embeds, dim=2).permute(0, 2, 1)
        return final_embeds

    def forward(self, xyz, num_channels=None, input_range=None):
        """Encode ``xyz`` (batch x npoints x d_in) with the configured scheme."""
        assert isinstance(xyz, torch.Tensor)
        assert xyz.ndim == 3
        with torch.no_grad():
            if self.pos_type == "sine":
                return self.get_sine_embeddings(xyz, num_channels, input_range)
            if self.pos_type == "fourier":
                return self.get_fourier_embeddings(
                    xyz, num_channels, input_range
                )
        raise ValueError(f"Unknown {self.pos_type}")

    def extra_repr(self):
        """Summarize the configuration for ``repr(module)``."""
        st = f"type={self.pos_type}, scale={self.scale}, normalize={self.normalize}"
        if hasattr(self, "gauss_B"):
            st += f", gaussB={self.gauss_B.shape}, gaussBsum={self.gauss_B.sum().item()}"
        return st
CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + # To use the model, must call initialize_coords before forward pass. + # Once data is processed, call clear to reset the model before calling initialize_coords + def __init__( + self, in_channels, out_channels, config, D=3, out_fpn=False, **kwargs + ): + super().__init__(in_channels, out_channels, config, D) + self.out_fpn = out_fpn + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv0p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + + self.bn0 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + + self.conv1p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn1 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv2p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn2 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv3p4s2 = conv( + 
self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn3 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv4p8s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn4 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr4p16s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr5p8s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr6p4s2 = conv_tr( + 
self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion + self.block7 = self._make_layer( + self.BLOCK, + self.PLANES[6], + self.LAYERS[6], + dilation=dilations[6], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr7p2s2 = conv_tr( + self.inplanes, + self.PLANES[7], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr7 = get_norm( + self.NORM_TYPE, self.PLANES[7], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[7] + self.INIT_DIM + self.block8 = self._make_layer( + self.BLOCK, + self.PLANES[7], + self.LAYERS[7], + dilation=dilations[7], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.final = conv( + self.PLANES[7], + out_channels, + kernel_size=1, + stride=1, + bias=True, + D=D, + ) + self.relu = MinkowskiReLU(inplace=True) + + def forward(self, x): + feature_maps = [] + + out = self.conv0p1s1(x) + out = self.bn0(out) + out_p1 = self.relu(out) + + out = self.conv1p1s2(out_p1) + out = self.bn1(out) + out = self.relu(out) + out_b1p2 = self.block1(out) + + out = self.conv2p2s2(out_b1p2) + out = self.bn2(out) + out = self.relu(out) + out_b2p4 = self.block2(out) + + out = self.conv3p4s2(out_b2p4) + out = self.bn3(out) + out = self.relu(out) + out_b3p8 = self.block3(out) + + # pixel_dist=16 + out = self.conv4p8s2(out_b3p8) + out = self.bn4(out) + out = self.relu(out) + out = self.block4(out) + + feature_maps.append(out) + + # pixel_dist=8 + out = self.convtr4p16s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p8) + out = self.block5(out) + + feature_maps.append(out) + + # 
pixel_dist=4 + out = self.convtr5p8s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p4) + out = self.block6(out) + + feature_maps.append(out) + + # pixel_dist=2 + out = self.convtr6p4s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p2) + out = self.block7(out) + + feature_maps.append(out) + + # pixel_dist=1 + out = self.convtr7p2s2(out) + out = self.bntr7(out) + out = self.relu(out) + + out = me.cat(out, out_p1) + out = self.block8(out) + + feature_maps.append(out) + + if not self.out_fpn: + return out + else: + return out, feature_maps + + +class Res16UNet14(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1, 1, 1, 1, 1) + + +class Res16UNet18(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2, 2, 2, 2, 2) + + +class Res16UNet34(Res16UNetBase): + BLOCK = BasicBlock + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class Res16UNet50(Res16UNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 6, 2, 2, 2, 2) + + +class Res16UNet101(Res16UNetBase): + BLOCK = Bottleneck + LAYERS = (2, 3, 4, 23, 2, 2, 2, 2) + + +class Res16UNet14A(Res16UNet14): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class Res16UNet14A2(Res16UNet14A): + LAYERS = (1, 1, 1, 1, 2, 2, 2, 2) + + +class Res16UNet14B(Res16UNet14): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class Res16UNet14B2(Res16UNet14B): + LAYERS = (1, 1, 1, 1, 2, 2, 2, 2) + + +class Res16UNet14B3(Res16UNet14B): + LAYERS = (2, 2, 2, 2, 1, 1, 1, 1) + + +class Res16UNet14C(Res16UNet14): + PLANES = (32, 64, 128, 256, 192, 192, 128, 128) + + +class Res16UNet14D(Res16UNet14): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class Res16UNet18A(Res16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class Res16UNet18B(Res16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 128, 128) + + +class Res16UNet18D(Res16UNet18): + PLANES = (32, 64, 128, 256, 384, 384, 384, 384) + + +class Res16UNet34A(Res16UNet34): + PLANES = (32, 64, 128, 
256, 256, 128, 64, 64) + + +class Res16UNet34B(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 64, 32) + + +class Res16UNet34C(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 96) + + +class Custom30M(Res16UNet34): + PLANES = (32, 64, 128, 256, 128, 64, 64, 32) + + +class Res16UNet34D(Res16UNet34): + PLANES = (32, 64, 128, 256, 256, 128, 96, 128) + + +class STRes16UNetBase(Res16UNetBase): + + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, in_channels, out_channels, config, D=4, **kwargs): + super().__init__(in_channels, out_channels, config, D, **kwargs) + + +class STRes16UNet14(STRes16UNetBase, Res16UNet14): + pass + + +class STRes16UNet14A(STRes16UNetBase, Res16UNet14A): + pass + + +class STRes16UNet18(STRes16UNetBase, Res16UNet18): + pass + + +class STRes16UNet34(STRes16UNetBase, Res16UNet34): + pass + + +class STRes16UNet50(STRes16UNetBase, Res16UNet50): + pass + + +class STRes16UNet101(STRes16UNetBase, Res16UNet101): + pass + + +class STRes16UNet18A(STRes16UNet18): + PLANES = (32, 64, 128, 256, 128, 128, 96, 96) + + +class STResTesseract16UNetBase(STRes16UNetBase): + pass + # CONV_TYPE = ConvType.HYPERCUBE + + +class STResTesseract16UNet18A(STRes16UNet18A, STResTesseract16UNetBase): + pass diff --git a/models/Mask3D/build/lib/mask3d/models/resnet.py b/models/Mask3D/build/lib/mask3d/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ad622893d191fce0cf9db6edafbc83f684d218 --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/models/resnet.py @@ -0,0 +1,243 @@ +import torch.nn as nn +import MinkowskiEngine as ME + +from mask3d.models.model import Model +from mask3d.models.modules.common import ConvType, NormType, conv, get_norm, sum_pool +from mask3d.models.modules.resnet_block import BasicBlock, Bottleneck + + +class ResNetBase(Model): + BLOCK = None + LAYERS = () + INIT_DIM = 64 + PLANES = (64, 128, 256, 512) + OUT_PIXEL_DIST = 32 + HAS_LAST_BLOCK = False + CONV_TYPE = 
ConvType.HYPERCUBE + + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + assert self.BLOCK is not None + assert self.OUT_PIXEL_DIST > 0 + + super().__init__(in_channels, out_channels, config, D, **kwargs) + + self.network_initialization(in_channels, out_channels, config, D) + self.weight_initialization() + + def network_initialization(self, in_channels, out_channels, config, D): + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + dilations = config.dilations + bn_momentum = config.bn_momentum + self.inplanes = self.INIT_DIM + self.conv1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + D=D, + ) + + self.bn1 = get_norm( + NormType.BATCH_NORM, + self.inplanes, + D=self.D, + bn_momentum=bn_momentum, + ) + self.relu = ME.MinkowskiReLU(inplace=True) + self.pool = sum_pool( + kernel_size=space_n_time_m(2, 1), stride=space_n_time_m(2, 1), D=D + ) + + self.layer1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[0], 1), + ) + self.layer2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[1], 1), + ) + self.layer3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[2], 1), + ) + self.layer4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + stride=space_n_time_m(2, 1), + dilation=space_n_time_m(dilations[3], 1), + ) + + self.final = conv( + self.PLANES[3] * self.BLOCK.expansion, + out_channels, + kernel_size=1, + bias=True, + D=D, + ) + + def weight_initialization(self): + for m in self.modules(): + if isinstance(m, ME.MinkowskiBatchNorm): + nn.init.constant_(m.bn.weight, 1) + nn.init.constant_(m.bn.bias, 0) + + def 
_make_layer( + self, + block, + planes, + blocks, + stride=1, + dilation=1, + norm_type=NormType.BATCH_NORM, + bn_momentum=0.1, + ): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + D=self.D, + ), + get_norm( + norm_type, + planes * block.expansion, + D=self.D, + bn_momentum=bn_momentum, + ), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride=stride, + dilation=dilation, + downsample=downsample, + conv_type=self.CONV_TYPE, + D=self.D, + ) + ) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + stride=1, + dilation=dilation, + conv_type=self.CONV_TYPE, + D=self.D, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.pool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.final(x) + return x + + +class ResNet14(ResNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1) + + +class ResNet18(ResNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2) + + +class ResNet34(ResNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3) + + +class ResNet50(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3) + + +class ResNet101(ResNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 23, 3) + + +class STResNetBase(ResNetBase): + + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + def __init__(self, in_channels, out_channels, config, D=4, **kwargs): + super().__init__(in_channels, out_channels, config, D, **kwargs) + + +class STResNet14(STResNetBase, ResNet14): + pass + + +class STResNet18(STResNetBase, ResNet18): + pass + + +class STResNet34(STResNetBase, ResNet34): + pass + + +class STResNet50(STResNetBase, ResNet50): + pass + + +class STResNet101(STResNetBase, 
ResNet101): + pass + + +class STResTesseractNetBase(STResNetBase): + CONV_TYPE = ConvType.HYPERCUBE + + +class STResTesseractNet14(STResTesseractNetBase, STResNet14): + pass + + +class STResTesseractNet18(STResTesseractNetBase, STResNet18): + pass + + +class STResTesseractNet34(STResTesseractNetBase, STResNet34): + pass + + +class STResTesseractNet50(STResTesseractNetBase, STResNet50): + pass + + +class STResTesseractNet101(STResTesseractNetBase, STResNet101): + pass diff --git a/models/Mask3D/build/lib/mask3d/models/resunet.py b/models/Mask3D/build/lib/mask3d/models/resunet.py new file mode 100644 index 0000000000000000000000000000000000000000..98a3adc56f09d534256960c080594e5df3a41c7c --- /dev/null +++ b/models/Mask3D/build/lib/mask3d/models/resunet.py @@ -0,0 +1,617 @@ +import torch.nn as nn +import MinkowskiEngine as ME +import MinkowskiEngine.MinkowskiOps as me +from MinkowskiEngine import MinkowskiReLU + +from mask3d.models.resnet import ResNetBase, get_norm +from mask3d.models.modules.common import ConvType, NormType, conv, conv_tr +from mask3d.models.modules.resnet_block import BasicBlock, Bottleneck, BasicBlockINBN + + +class MinkUNetBase(ResNetBase): + BLOCK = None + PLANES = (64, 128, 256, 512, 256, 128, 128) + DILATIONS = (1, 1, 1, 1, 1, 1) + LAYERS = (2, 2, 2, 2, 2, 2) + INIT_DIM = 64 + OUT_PIXEL_DIST = 1 + NORM_TYPE = NormType.BATCH_NORM + NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE + CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS + + # To use the model, must call initialize_coords before forward pass. 
+ # Once data is processed, call clear to reset the model before calling initialize_coords + def __init__(self, in_channels, out_channels, config, D=3, **kwargs): + super().__init__(in_channels, out_channels, config, D) + + def network_initialization(self, in_channels, out_channels, config, D): + # Setup net_metadata + dilations = self.DILATIONS + bn_momentum = config.bn_momentum + + def space_n_time_m(n, m): + return n if D == 3 else [n, n, n, m] + + if D == 4: + self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1) + + # Output of the first conv concated to conv6 + self.inplanes = self.INIT_DIM + self.conv1p1s1 = conv( + in_channels, + self.inplanes, + kernel_size=space_n_time_m(config.conv1_kernel_size, 1), + stride=1, + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + + self.bn1 = get_norm( + self.NORM_TYPE, self.PLANES[0], D, bn_momentum=bn_momentum + ) + self.block1 = self._make_layer( + self.BLOCK, + self.PLANES[0], + self.LAYERS[0], + dilation=dilations[0], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv2p1s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn2 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block2 = self._make_layer( + self.BLOCK, + self.PLANES[1], + self.LAYERS[1], + dilation=dilations[1], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv3p2s2 = conv( + self.inplanes, + self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn3 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block3 = self._make_layer( + self.BLOCK, + self.PLANES[2], + self.LAYERS[2], + dilation=dilations[2], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + + self.conv4p4s2 = conv( + self.inplanes, + 
self.inplanes, + kernel_size=space_n_time_m(2, 1), + stride=space_n_time_m(2, 1), + dilation=1, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bn4 = get_norm( + self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum + ) + self.block4 = self._make_layer( + self.BLOCK, + self.PLANES[3], + self.LAYERS[3], + dilation=dilations[3], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr4p8s2 = conv_tr( + self.inplanes, + self.PLANES[4], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr4 = get_norm( + self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion + self.block5 = self._make_layer( + self.BLOCK, + self.PLANES[4], + self.LAYERS[4], + dilation=dilations[4], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr5p4s2 = conv_tr( + self.inplanes, + self.PLANES[5], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr5 = get_norm( + self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum + ) + + self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion + self.block6 = self._make_layer( + self.BLOCK, + self.PLANES[5], + self.LAYERS[5], + dilation=dilations[5], + norm_type=self.NORM_TYPE, + bn_momentum=bn_momentum, + ) + self.convtr6p2s2 = conv_tr( + self.inplanes, + self.PLANES[6], + kernel_size=space_n_time_m(2, 1), + upsample_stride=space_n_time_m(2, 1), + dilation=1, + bias=False, + conv_type=self.NON_BLOCK_CONV_TYPE, + D=D, + ) + self.bntr6 = get_norm( + self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum + ) + self.relu = MinkowskiReLU(inplace=True) + + self.final = nn.Sequential( + conv( + self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion, + 512, + kernel_size=1, + stride=1, + dilation=1, 
+ bias=False, + D=D, + ), + ME.MinkowskiBatchNorm(512), + ME.MinkowskiReLU(), + conv( + 512, + out_channels, + kernel_size=1, + stride=1, + dilation=1, + bias=True, + D=D, + ), + ) + + def forward(self, x): + out = self.conv1p1s1(x) + out = self.bn1(out) + out = self.relu(out) + + out_b1p1 = self.block1(out) + + out = self.conv2p1s2(out_b1p1) + out = self.bn2(out) + out = self.relu(out) + + out_b2p2 = self.block2(out) + + out = self.conv3p2s2(out_b2p2) + out = self.bn3(out) + out = self.relu(out) + + out_b3p4 = self.block3(out) + + out = self.conv4p4s2(out_b3p4) + out = self.bn4(out) + out = self.relu(out) + + # pixel_dist=8 + out = self.block4(out) + + out = self.convtr4p8s2(out) + out = self.bntr4(out) + out = self.relu(out) + + out = me.cat(out, out_b3p4) + out = self.block5(out) + + out = self.convtr5p4s2(out) + out = self.bntr5(out) + out = self.relu(out) + + out = me.cat(out, out_b2p2) + out = self.block6(out) + + out = self.convtr6p2s2(out) + out = self.bntr6(out) + out = self.relu(out) + + out = me.cat(out, out_b1p1) + return self.final(out) + + +class ResUNet14(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (1, 1, 1, 1, 1, 1) + + +class ResUNet18(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (2, 2, 2, 2, 2, 2) + + +class ResUNet18INBN(ResUNet18): + NORM_TYPE = NormType.INSTANCE_BATCH_NORM + BLOCK = BasicBlockINBN + + +class ResUNet34(MinkUNetBase): + BLOCK = BasicBlock + LAYERS = (3, 4, 6, 3, 2, 2) + + +class ResUNet50(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 6, 3, 2, 2) + + +class ResUNet101(MinkUNetBase): + BLOCK = Bottleneck + LAYERS = (3, 4, 23, 3, 2, 2) + + +class ResUNet14D(ResUNet14): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet18D(ResUNet18): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet34D(ResUNet34): + PLANES = (64, 128, 256, 512, 512, 512, 512) + + +class ResUNet34E(ResUNet34): + INIT_DIM = 32 + PLANES = (32, 64, 128, 256, 128, 64, 64) + + +class ResUNet34F(ResUNet34): + INIT_DIM = 32 + PLANES 
class MinkUNetHyper(MinkUNetBase):
    """U-Net variant whose head also sees up-pooled decoder features.

    Compared with the forward pass of the parent visible in this file, the
    outputs of ``block5`` and ``block6`` are additionally passed through
    ``MinkowskiPoolingTranspose`` layers and concatenated into the input of
    ``self.final`` (a "hypercolumn"-style head).
    """

    # Concrete subclasses must set BLOCK: ``self.BLOCK.expansion`` is read in
    # ``network_initialization`` and would fail while BLOCK is None.
    BLOCK = None
    PLANES = (64, 128, 256, 512, 256, 128, 128)
    DILATIONS = (1, 1, 1, 1, 1, 1)
    LAYERS = (2, 2, 2, 2, 2, 2)
    INIT_DIM = 64
    OUT_PIXEL_DIST = 1
    NORM_TYPE = NormType.BATCH_NORM
    NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE
    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    # To use the model, must call initialize_coords before forward pass.
    # Once data is processed, call clear to reset the model before calling initialize_coords
    def __init__(self, in_channels, out_channels, config, D=3, **kwargs):
        # ``super(MinkUNetBase, self)`` starts the MRO lookup *after*
        # MinkUNetBase, so MinkUNetBase.__init__ itself is skipped.
        # ``**kwargs`` is accepted but not forwarded.
        super(MinkUNetBase, self).__init__(
            in_channels, out_channels, config, D
        )

    def network_initialization(self, in_channels, out_channels, config, D):
        """Create all layers of the network.

        Args:
            in_channels: number of input feature channels of the first conv.
            out_channels: number of output channels of the final 1x1 conv.
            config: object providing ``bn_momentum`` and ``conv1_kernel_size``.
            D: dimension passed to every layer; ``D == 3`` uses scalar
                kernel/stride values, any other value uses 4-element lists.
        """
        # Setup net_metadata
        dilations = self.DILATIONS
        bn_momentum = config.bn_momentum

        def space_n_time_m(n, m):
            # Scalar for D == 3, otherwise [n, n, n, m] (last axis gets m).
            return n if D == 3 else [n, n, n, m]

        if D == 4:
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        # Output of the first conv concatenated to conv6
        self.inplanes = self.INIT_DIM
        self.conv1p1s1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )

        # NOTE(review): bn1 is sized with PLANES[0] while conv1p1s1 outputs
        # INIT_DIM channels; the two are equal for this class (64) — confirm
        # for subclasses that override either value.
        self.bn1 = get_norm(
            self.NORM_TYPE, self.PLANES[0], D, bn_momentum=bn_momentum
        )
        # NOTE(review): the code below reads ``self.inplanes`` right after
        # each ``_make_layer`` call as the channel count of that stage, so
        # ``_make_layer`` presumably updates it — defined outside this view.
        self.block1 = self._make_layer(
            self.BLOCK,
            self.PLANES[0],
            self.LAYERS[0],
            dilation=dilations[0],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        self.conv2p1s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn2 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block2 = self._make_layer(
            self.BLOCK,
            self.PLANES[1],
            self.LAYERS[1],
            dilation=dilations[1],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        self.conv3p2s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn3 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block3 = self._make_layer(
            self.BLOCK,
            self.PLANES[2],
            self.LAYERS[2],
            dilation=dilations[2],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        self.conv4p4s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn4 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block4 = self._make_layer(
            self.BLOCK,
            self.PLANES[3],
            self.LAYERS[3],
            dilation=dilations[3],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        # pool_tr4 is created here but never called in ``forward`` below.
        self.pool_tr4 = ME.MinkowskiPoolingTranspose(
            kernel_size=8, stride=8, dimension=D
        )
        # The value is read and discarded; only pool5/pool6 widths are kept.
        _ = self.inplanes
        self.convtr4p8s2 = conv_tr(
            self.inplanes,
            self.PLANES[4],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr4 = get_norm(
            self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum
        )

        # Input width of block5: up-sampled features concatenated with the
        # block3 skip connection (see ``forward``).
        self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion
        self.block5 = self._make_layer(
            self.BLOCK,
            self.PLANES[4],
            self.LAYERS[4],
            dilation=dilations[4],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        self.pool_tr5 = ME.MinkowskiPoolingTranspose(
            kernel_size=4, stride=4, dimension=D
        )
        # Channel count of the block5 output; added to the head input width.
        out_pool5 = self.inplanes
        self.convtr5p4s2 = conv_tr(
            self.inplanes,
            self.PLANES[5],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr5 = get_norm(
            self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum
        )

        # Input width of block6: up-sampled features + block2 skip connection.
        self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion
        self.block6 = self._make_layer(
            self.BLOCK,
            self.PLANES[5],
            self.LAYERS[5],
            dilation=dilations[5],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        self.pool_tr6 = ME.MinkowskiPoolingTranspose(
            kernel_size=2, stride=2, dimension=D
        )
        # Channel count of the block6 output; added to the head input width.
        out_pool6 = self.inplanes
        self.convtr6p2s2 = conv_tr(
            self.inplanes,
            self.PLANES[6],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr6 = get_norm(
            self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum
        )

        self.relu = MinkowskiReLU(inplace=True)

        # Head input = pooled block5 + pooled block6 + last decoder output
        # + block1 skip connection, matching the ``me.cat`` in ``forward``.
        self.final = nn.Sequential(
            conv(
                out_pool5
                + out_pool6
                + self.PLANES[6]
                + self.PLANES[0] * self.BLOCK.expansion,
                512,
                kernel_size=1,
                bias=False,
                D=D,
            ),
            ME.MinkowskiBatchNorm(512),
            ME.MinkowskiReLU(),
            conv(512, out_channels, kernel_size=1, bias=True, D=D),
        )

    def forward(self, x):
        """Run the network and return the output of ``self.final``."""
        out = self.conv1p1s1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out_b1p1 = self.block1(out)

        out = self.conv2p1s2(out_b1p1)
        out = self.bn2(out)
        out = self.relu(out)

        out_b2p2 = self.block2(out)

        out = self.conv3p2s2(out_b2p2)
        out = self.bn3(out)
        out = self.relu(out)

        out_b3p4 = self.block3(out)

        out = self.conv4p4s2(out_b3p4)
        out = self.bn4(out)
        out = self.relu(out)

        # pixel_dist=8
        out = self.block4(out)

        out = self.convtr4p8s2(out)
        out = self.bntr4(out)
        out = self.relu(out)

        out = me.cat(out, out_b3p4)
        out = self.block5(out)
        # Up-pooled copy of the block5 output, kept for the final head.
        out_5 = self.pool_tr5(out)

        out = self.convtr5p4s2(out)
        out = self.bntr5(out)
        out = self.relu(out)

        out = me.cat(out, out_b2p2)
        out = self.block6(out)
        # Up-pooled copy of the block6 output, kept for the final head.
        out_6 = self.pool_tr6(out)

        out = self.convtr6p2s2(out)
        out = self.bntr6(out)
        out = self.relu(out)

        # Concatenation order must match the channel sum used for self.final.
        out = me.cat(out, out_b1p1, out_6, out_5)
        return self.final(out)
class Wrapper(Module):
    """
    Base class that chains a segmentation network with a filter.

    Subclasses implement ``initialize_filter`` and are expected to set
    ``self.model`` and ``self.filter`` there; ``forward`` uses both.
    """

    OUT_PIXEL_DIST = -1

    def __init__(self, NetClass, in_nchannel, out_nchannel, config):
        super(Wrapper, self).__init__()
        self.initialize_filter(NetClass, in_nchannel, out_nchannel, config)

    def initialize_filter(self, NetClass, in_nchannel, out_nchannel, config):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError("Must initialize a model and a filter")

    def forward(self, x, coords, colors=None):
        """Run the model, then (usually) the filter on the model output.

        In eval mode the filter is always applied. In training mode it is
        applied only when ``random.random() < 0.5`` so that the network is
        also trained without it.
        """
        network_out = self.model(x)

        # During training, make the network invariant to the filter:
        # skip it for draws >= 0.5.
        if self.training and random.random() >= 0.5:
            return network_out

        # Filter requires the model to finish the forward pass
        filter_coords = self.filter.initialize_coords(
            self.model, coords, colors
        )
        return self.filter(SparseTensor(network_out.F, filter_coords))
def main(cfg: DictConfig):
    """Run Mask3D on the scene meshes under ``root_dir`` and export results.

    For each processed scene the function reads ``mesh_tsdf.ply``, voxelizes
    the vertices at 2 cm, runs the model, keeps queries whose combined
    class/mask confidence exceeds 0.5, and writes into
    ``<scene>/pred_mask3d_ours``:

    * ``mesh_tsdf_labelled.ply`` - the mesh coloured per predicted class,
    * ``mask3d_predictions.txt`` - one ``<mask file> <class id> <score>`` line
      per kept instance,
    * ``pred_mask/NNN.txt`` - the per-vertex binary mask of each instance.

    Args:
        cfg: Hydra configuration; ``cfg.model`` is instantiated and
            ``cfg.general.backbone_checkpoint`` / ``cfg.general.checkpoint``
            select the weights to load.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.chdir(hydra.utils.get_original_cwd())
    model = InstanceSegmentation(cfg)

    if cfg.general.backbone_checkpoint is not None:
        cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys(
            cfg, model
        )
    if cfg.general.checkpoint is not None:
        cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model)

    model = model.to(device)
    # NOTE(review): the model is left in training mode (``model.eval()`` was
    # disabled by the original author) - confirm this is intentional.
    # model.eval()

    color_mean = (0.47793125906962, 0.4303257521323044, 0.3749598901421883)
    color_std = (0.2834475483823543, 0.27566157565723015, 0.27018971370874995)
    normalize_color = A.Normalize(mean=color_mean, std=color_std)

    # iterate over data
    for sc in os.listdir(root_dir):
        mesh_file = os.path.join(root_dir, sc, 'mesh_tsdf.ply')
        if not os.path.exists(mesh_file):
            continue

        # NOTE(review): hard-coded single-scene filter kept from the original.
        # It now runs *before* the output directory is created so that
        # skipped scenes no longer get an empty ``pred_mask3d_ours`` folder.
        if sc != '42445991':
            continue

        # save outputs
        output_dir = os.path.join(root_dir, sc, 'pred_mask3d_ours')
        os.makedirs(output_dir, exist_ok=True)

        print('Processing', sc)

        mesh = o3d.io.read_triangle_mesh(mesh_file)
        mesh.compute_vertex_normals()

        points = np.asarray(mesh.vertices)
        colors = np.asarray(mesh.vertex_colors)

        # Normalize colours through albumentations by treating the vertex
        # list as a 1 x N "image".
        colors = colors * 255.
        pseudo_image = colors.astype(np.uint8)[np.newaxis, :, :]
        colors = np.squeeze(normalize_color(image=pseudo_image)["image"])

        # voxelize data
        coords = np.floor(points / 0.02)

        # unique_map selects one vertex per voxel; inverse_map maps every
        # original vertex back to its voxel.
        _, _, unique_map, inverse_map = ME.utils.sparse_quantize(
            coordinates=coords,
            features=colors,
            return_index=True,
            return_inverse=True,
        )

        sample_coordinates = coords[unique_map]
        coordinates = [torch.from_numpy(sample_coordinates).int()]
        sample_features = colors[unique_map]
        features = [torch.from_numpy(sample_features).float()]

        coordinates, _ = ME.utils.sparse_collate(
            coords=coordinates, feats=features
        )
        features = torch.cat(features, dim=0)
        data = ME.SparseTensor(
            coordinates=coordinates,
            features=features,
            device=device,
        )

        # run model
        # NOTE(review): the normalized colours are passed as
        # ``raw_coordinates`` - verify this is what the model expects.
        with torch.no_grad():
            outputs = model(data, raw_coordinates=features)

        del data
        torch.cuda.empty_cache()

        # parse and reformat predictions (first batch element only)
        logits = outputs["pred_logits"][0].detach().cpu()
        masks = outputs["pred_masks"][0].detach().cpu()

        labels = []
        confidences = []
        masks_binary = []

        for i, query_logits in enumerate(logits):
            p_labels = torch.softmax(query_logits, dim=-1)
            p_masks = torch.sigmoid(masks[:, i])
            label = torch.argmax(p_labels, dim=-1)
            c_label = torch.max(p_labels)
            m = p_masks > 0.5
            # mean mask probability over the points inside the mask
            c_m = p_masks[m].sum() / (m.sum() + 1e-8)
            c = c_label * c_m
            if label < 200 and c > 0.5:
                labels.append(label.item())
                confidences.append(c.item())
                # mapping the mask back to the original point cloud
                masks_binary.append(m[inverse_map])

        # Order instances by ascending confidence (ties broken by label).
        # Sorting indices with an explicit key avoids comparing mask tensors
        # on ties and also works when no instance was kept.
        order = sorted(
            range(len(confidences)),
            key=lambda idx: (confidences[idx], labels[idx]),
        )
        confidences = tuple(confidences[idx] for idx in order)
        labels = tuple(labels[idx] for idx in order)
        masks_binary = tuple(masks_binary[idx] for idx in order)

        # save labelled mesh
        mesh_labelled = o3d.geometry.TriangleMesh()
        mesh_labelled.vertices = mesh.vertices
        mesh_labelled.triangles = mesh.triangles

        colors_mapped = np.zeros((len(mesh.vertices), 3))

        # Higher-confidence instances are painted last and therefore win on
        # overlapping vertices.
        for label, m in zip(labels, masks_binary):
            if label == 0:
                l_ = -1 + 2  # label offset is 2 for scannet 200, 0 needs to be mapped to -1 before (see trainer.py in Mask3D)
            else:
                l_ = label + 2
            colors_mapped[m == 1] = SCANNET_COLOR_MAP_200[VALID_CLASS_IDS_200[l_]]

        mesh_labelled.vertex_colors = o3d.utility.Vector3dVector(
            colors_mapped.astype(np.float32) / 255.
        )
        o3d.io.write_triangle_mesh(
            os.path.join(output_dir, 'mesh_tsdf_labelled.ply'), mesh_labelled
        )

        mask_path = os.path.join(output_dir, 'pred_mask')
        os.makedirs(mask_path, exist_ok=True)

        # Instances are written in the confidence order established above.
        with open(os.path.join(output_dir, 'mask3d_predictions.txt'), 'w') as f:
            for i, (label, c, m) in enumerate(
                zip(labels, confidences, masks_binary)
            ):
                mask_file = f'pred_mask/{str(i).zfill(3)}.txt'
                # NOTE(review): this uses the raw label as index while the
                # colouring above uses the offset label ``l_`` - one of the
                # two mappings is probably wrong; confirm against trainer.py.
                f.write(f'{mask_file} {VALID_CLASS_IDS_200[label]} {c}\n')
                np.savetxt(
                    os.path.join(output_dir, mask_file), m.numpy(), fmt='%d'
                )
@functools.lru_cache(20)
def get_evenly_distributed_colors(
    count: int,
) -> List[np.ndarray]:
    """Return ``count`` fully saturated RGB colours with evenly spaced hues.

    The hues ``0/count, 1/count, ...`` are shuffled with the global ``random``
    state before conversion, so the order is random on the first call.
    Because of the LRU cache, later calls with the same ``count`` return the
    *same list object* in the same order; callers must not mutate it.

    Args:
        count: number of colours to generate; ``0`` yields an empty list.

    Returns:
        A list of ``count`` arrays of shape ``(3,)`` and dtype ``uint8``.
    """
    hsv_tuples = [(x / count, 1.0, 1.0) for x in range(count)]
    random.shuffle(hsv_tuples)
    return [
        (np.array(colorsys.hsv_to_rgb(*hsv)) * 255).astype(np.uint8)
        for hsv in hsv_tuples
    ]
matcher.cost_dice, + } + + aux_weight_dict = {} + for i in range(self.model.num_levels * self.model.num_decoders): + if i not in self.config.general.ignore_mask_idx: + aux_weight_dict.update( + {k + f"_{i}": v for k, v in weight_dict.items()} + ) + else: + aux_weight_dict.update( + {k + f"_{i}": 0.0 for k, v in weight_dict.items()} + ) + weight_dict.update(aux_weight_dict) + + self.preds = dict() + self.bbox_preds = dict() + self.bbox_gt = dict() + + self.criterion = hydra.utils.instantiate( + config.loss, matcher=matcher, weight_dict=weight_dict + ) + + # metrics + self.confusion = hydra.utils.instantiate(config.metrics) + self.iou = IoU() + # misc + self.labels_info = dict() + + def forward( + self, x, point2segment=None, raw_coordinates=None, is_eval=False + ): + with self.optional_freeze(): + x = self.model( + x, + point2segment, + raw_coordinates=raw_coordinates, + is_eval=is_eval, + ) + return x + + def training_step(self, batch, batch_idx): + data, target, file_names = batch + + if data.features.shape[0] > self.config.general.max_batch_size: + print("data exceeds threshold") + raise RuntimeError("BATCH TOO BIG") + + if len(target) == 0: + print("no targets") + return None + + raw_coordinates = None + if self.config.data.add_raw_coordinates: + raw_coordinates = data.features[:, -3:] + data.features = data.features[:, :-3] + + data = ME.SparseTensor( + coordinates=data.coordinates, + features=data.features, + device=self.device, + ) + + try: + output = self.forward( + data, + point2segment=[ + target[i]["point2segment"] for i in range(len(target)) + ], + raw_coordinates=raw_coordinates, + ) + except RuntimeError as run_err: + print(run_err) + if ( + "only a single point gives nans in cross-attention" + == run_err.args[0] + ): + return None + else: + raise run_err + + try: + losses = self.criterion(output, target, mask_type=self.mask_type) + except ValueError as val_err: + print(f"ValueError: {val_err}") + print(f"data shape: {data.shape}") + print(f"data feat 
shape: {data.features.shape}") + print(f"data feat nans: {data.features.isnan().sum()}") + print(f"output: {output}") + print(f"target: {target}") + print(f"filenames: {file_names}") + raise val_err + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + + logs = { + f"train_{k}": v.detach().cpu().item() for k, v in losses.items() + } + + logs["train_mean_loss_ce"] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_ce" in k]] + ) + + logs["train_mean_loss_mask"] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_mask" in k]] + ) + + logs["train_mean_loss_dice"] = statistics.mean( + [item for item in [v for k, v in logs.items() if "loss_dice" in k]] + ) + + self.log_dict(logs) + return sum(losses.values()) + + def validation_step(self, batch, batch_idx): + return self.eval_step(batch, batch_idx) + + def export(self, pred_masks, scores, pred_classes, file_names, decoder_id): + root_path = f"eval_output" + base_path = f"{root_path}/instance_evaluation_{self.config.general.experiment_name}_{self.current_epoch}/decoder_{decoder_id}" + pred_mask_path = f"{base_path}/pred_mask" + + Path(pred_mask_path).mkdir(parents=True, exist_ok=True) + + file_name = file_names + with open(f"{base_path}/{file_name}.txt", "w") as fout: + real_id = -1 + for instance_id in range(len(pred_classes)): + real_id += 1 + pred_class = pred_classes[instance_id] + score = scores[instance_id] + mask = pred_masks[:, instance_id].astype("uint8") + + if score > self.config.general.export_threshold: + # reduce the export size a bit. 
I guess no performance difference + np.savetxt( + f"{pred_mask_path}/{file_name}_{real_id}.txt", + mask, + fmt="%d", + ) + fout.write( + f"pred_mask/{file_name}_{real_id}.txt {pred_class} {score}\n" + ) + + def training_epoch_end(self, outputs): + train_loss = sum([out["loss"].cpu().item() for out in outputs]) / len( + outputs + ) + results = {"train_loss_mean": train_loss} + self.log_dict(results) + + def validation_epoch_end(self, outputs): + self.test_epoch_end(outputs) + + def save_visualizations( + self, + target_full, + full_res_coords, + sorted_masks, + sort_classes, + file_name, + original_colors, + original_normals, + sort_scores_values, + point_size=20, + sorted_heatmaps=None, + query_pos=None, + backbone_features=None, + ): + + full_res_coords -= full_res_coords.mean(axis=0) + + gt_pcd_pos = [] + gt_pcd_normals = [] + gt_pcd_color = [] + gt_inst_pcd_color = [] + gt_boxes = [] + + if "labels" in target_full: + instances_colors = torch.from_numpy( + np.vstack( + get_evenly_distributed_colors( + target_full["labels"].shape[0] + ) + ) + ) + for instance_counter, (label, mask) in enumerate( + zip(target_full["labels"], target_full["masks"]) + ): + if label == 255: + continue + + mask_tmp = mask.detach().cpu().numpy() + mask_coords = full_res_coords[mask_tmp.astype(bool), :] + + if len(mask_coords) == 0: + continue + + gt_pcd_pos.append(mask_coords) + mask_coords_min = full_res_coords[ + mask_tmp.astype(bool), : + ].min(axis=0) + mask_coords_max = full_res_coords[ + mask_tmp.astype(bool), : + ].max(axis=0) + size = mask_coords_max - mask_coords_min + mask_coords_middle = mask_coords_min + size / 2 + + gt_boxes.append( + { + "position": mask_coords_middle, + "size": size, + "color": self.validation_dataset.map2color([label])[0], + } + ) + + gt_pcd_color.append( + self.validation_dataset.map2color([label]).repeat( + gt_pcd_pos[-1].shape[0], 1 + ) + ) + gt_inst_pcd_color.append( + instances_colors[instance_counter % len(instances_colors)] + .unsqueeze(0) + 
.repeat(gt_pcd_pos[-1].shape[0], 1) + ) + + gt_pcd_normals.append( + original_normals[mask_tmp.astype(bool), :] + ) + + gt_pcd_pos = np.concatenate(gt_pcd_pos) + gt_pcd_normals = np.concatenate(gt_pcd_normals) + gt_pcd_color = np.concatenate(gt_pcd_color) + gt_inst_pcd_color = np.concatenate(gt_inst_pcd_color) + + v = vis.Visualizer() + + v.add_points( + "RGB Input", + full_res_coords, + colors=original_colors, + normals=original_normals, + visible=True, + point_size=point_size, + ) + + if backbone_features is not None: + v.add_points( + "PCA", + full_res_coords, + colors=backbone_features, + normals=original_normals, + visible=False, + point_size=point_size, + ) + + if "labels" in target_full: + v.add_points( + "Semantics (GT)", + gt_pcd_pos, + colors=gt_pcd_color, + normals=gt_pcd_normals, + alpha=0.8, + visible=False, + point_size=point_size, + ) + v.add_points( + "Instances (GT)", + gt_pcd_pos, + colors=gt_inst_pcd_color, + normals=gt_pcd_normals, + alpha=0.8, + visible=False, + point_size=point_size, + ) + + pred_coords = [] + pred_normals = [] + pred_sem_color = [] + pred_inst_color = [] + + for did in range(len(sorted_masks)): + instances_colors = torch.from_numpy( + np.vstack( + get_evenly_distributed_colors( + max(1, sorted_masks[did].shape[1]) + ) + ) + ) + + for i in reversed(range(sorted_masks[did].shape[1])): + coords = full_res_coords[ + sorted_masks[did][:, i].astype(bool), : + ] + + mask_coords = full_res_coords[ + sorted_masks[did][:, i].astype(bool), : + ] + mask_normals = original_normals[ + sorted_masks[did][:, i].astype(bool), : + ] + + label = sort_classes[did][i] + + if len(mask_coords) == 0: + continue + + pred_coords.append(mask_coords) + pred_normals.append(mask_normals) + + pred_sem_color.append( + self.validation_dataset.map2color([label]).repeat( + mask_coords.shape[0], 1 + ) + ) + + pred_inst_color.append( + instances_colors[i % len(instances_colors)] + .unsqueeze(0) + .repeat(mask_coords.shape[0], 1) + ) + + if len(pred_coords) > 0: + 
pred_coords = np.concatenate(pred_coords) + pred_normals = np.concatenate(pred_normals) + pred_sem_color = np.concatenate(pred_sem_color) + pred_inst_color = np.concatenate(pred_inst_color) + + v.add_points( + "Semantics (Mask3D)", + pred_coords, + colors=pred_sem_color, + normals=pred_normals, + visible=False, + alpha=0.8, + point_size=point_size, + ) + v.add_points( + "Instances (Mask3D)", + pred_coords, + colors=pred_inst_color, + normals=pred_normals, + visible=False, + alpha=0.8, + point_size=point_size, + ) + + v.save( + f"{self.config['general']['save_dir']}/visualizations/{file_name}" + ) + + def eval_step(self, batch, batch_idx): + data, target, file_names = batch + inverse_maps = data.inverse_maps + target_full = data.target_full + original_colors = data.original_colors + data_idx = data.idx + original_normals = data.original_normals + original_coordinates = data.original_coordinates + + # if len(target) == 0 or len(target_full) == 0: + # print("no targets") + # return None + + if len(data.coordinates) == 0: + return 0.0 + + raw_coordinates = None + if self.config.data.add_raw_coordinates: + raw_coordinates = data.features[:, -3:] + data.features = data.features[:, :-3] + + if raw_coordinates.shape[0] == 0: + return 0.0 + + data = ME.SparseTensor( + coordinates=data.coordinates, + features=data.features, + device=self.device, + ) + + try: + output = self.forward( + data, + point2segment=[ + target[i]["point2segment"] for i in range(len(target)) + ], + raw_coordinates=raw_coordinates, + is_eval=True, + ) + except RuntimeError as run_err: + print(run_err) + if ( + "only a single point gives nans in cross-attention" + == run_err.args[0] + ): + return None + else: + raise run_err + + if self.config.data.test_mode != "test": + if self.config.trainer.deterministic: + torch.use_deterministic_algorithms(False) + + try: + losses = self.criterion( + output, target, mask_type=self.mask_type + ) + except ValueError as val_err: + print(f"ValueError: {val_err}") + 
print(f"data shape: {data.shape}") + print(f"data feat shape: {data.features.shape}") + print(f"data feat nans: {data.features.isnan().sum()}") + print(f"output: {output}") + print(f"target: {target}") + print(f"filenames: {file_names}") + raise val_err + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + if self.config.trainer.deterministic: + torch.use_deterministic_algorithms(True) + + if self.config.general.save_visualizations: + backbone_features = ( + output["backbone_features"].F.detach().cpu().numpy() + ) + from sklearn import decomposition + + pca = decomposition.PCA(n_components=3) + pca.fit(backbone_features) + pca_features = pca.transform(backbone_features) + rescaled_pca = ( + 255 + * (pca_features - pca_features.min()) + / (pca_features.max() - pca_features.min()) + ) + + self.eval_instance_step( + output, + target, + target_full, + inverse_maps, + file_names, + original_coordinates, + original_colors, + original_normals, + raw_coordinates, + data_idx, + backbone_features=rescaled_pca + if self.config.general.save_visualizations + else None, + ) + + if self.config.data.test_mode != "test": + return { + f"val_{k}": v.detach().cpu().item() for k, v in losses.items() + } + else: + return 0.0 + + def test_step(self, batch, batch_idx): + return self.eval_step(batch, batch_idx) + + def get_full_res_mask( + self, mask, inverse_map, point2segment_full, is_heatmap=False + ): + mask = mask.detach().cpu()[inverse_map] # full res + + if self.eval_on_segments and is_heatmap == False: + mask = scatter_mean( + mask, point2segment_full, dim=0 + ) # full res segments + mask = (mask > 0.5).float() + mask = mask.detach().cpu()[ + point2segment_full.cpu() + ] # full res points + + return mask + + def get_mask_and_scores( + self, mask_cls, mask_pred, num_queries=100, num_classes=18, device=None + ): + if device is None: + device 
= self.device + labels = ( + torch.arange(num_classes, device=device) + .unsqueeze(0) + .repeat(num_queries, 1) + .flatten(0, 1) + ) + + if self.config.general.topk_per_image != -1: + scores_per_query, topk_indices = mask_cls.flatten(0, 1).topk( + self.config.general.topk_per_image, sorted=True + ) + else: + scores_per_query, topk_indices = mask_cls.flatten(0, 1).topk( + num_queries, sorted=True + ) + + labels_per_query = labels[topk_indices] + topk_indices = topk_indices // num_classes + mask_pred = mask_pred[:, topk_indices] + + result_pred_mask = (mask_pred > 0).float() + heatmap = mask_pred.float().sigmoid() + + mask_scores_per_image = (heatmap * result_pred_mask).sum(0) / ( + result_pred_mask.sum(0) + 1e-6 + ) + score = scores_per_query * mask_scores_per_image + classes = labels_per_query + + return score, result_pred_mask, classes, heatmap + + def eval_instance_step( + self, + output, + target_low_res, + target_full_res, + inverse_maps, + file_names, + full_res_coords, + original_colors, + original_normals, + raw_coords, + idx, + first_full_res=False, + backbone_features=None, + ): + label_offset = self.validation_dataset.label_offset + prediction = output["aux_outputs"] + prediction.append( + { + "pred_logits": output["pred_logits"], + "pred_masks": output["pred_masks"], + } + ) + + prediction[self.decoder_id][ + "pred_logits" + ] = torch.functional.F.softmax( + prediction[self.decoder_id]["pred_logits"], dim=-1 + )[ + ..., :-1 + ] + + all_pred_classes = list() + all_pred_masks = list() + all_pred_scores = list() + all_heatmaps = list() + all_query_pos = list() + + offset_coords_idx = 0 + for bid in range(len(prediction[self.decoder_id]["pred_masks"])): + if not first_full_res: + if self.model.train_on_segments: + masks = ( + prediction[self.decoder_id]["pred_masks"][bid] + .detach() + .cpu()[target_low_res[bid]["point2segment"].cpu()] + ) + else: + masks = ( + prediction[self.decoder_id]["pred_masks"][bid] + .detach() + .cpu() + ) + + if 
self.config.general.use_dbscan: + new_preds = { + "pred_masks": list(), + "pred_logits": list(), + } + + curr_coords_idx = masks.shape[0] + curr_coords = raw_coords[ + offset_coords_idx : curr_coords_idx + offset_coords_idx + ] + offset_coords_idx += curr_coords_idx + + for curr_query in range(masks.shape[1]): + curr_masks = masks[:, curr_query] > 0 + + if curr_coords[curr_masks].shape[0] > 0: + clusters = ( + DBSCAN( + eps=self.config.general.dbscan_eps, + min_samples=self.config.general.dbscan_min_points, + n_jobs=-1, + ) + .fit(curr_coords[curr_masks]) + .labels_ + ) + + new_mask = torch.zeros(curr_masks.shape, dtype=int) + new_mask[curr_masks] = ( + torch.from_numpy(clusters) + 1 + ) + + for cluster_id in np.unique(clusters): + original_pred_masks = masks[:, curr_query] + if cluster_id != -1: + new_preds["pred_masks"].append( + original_pred_masks + * (new_mask == cluster_id + 1) + ) + new_preds["pred_logits"].append( + prediction[self.decoder_id][ + "pred_logits" + ][bid, curr_query] + ) + + scores, masks, classes, heatmap = self.get_mask_and_scores( + torch.stack(new_preds["pred_logits"]).cpu(), + torch.stack(new_preds["pred_masks"]).T, + len(new_preds["pred_logits"]), + self.model.num_classes - 1, + ) + else: + scores, masks, classes, heatmap = self.get_mask_and_scores( + prediction[self.decoder_id]["pred_logits"][bid] + .detach() + .cpu(), + masks, + prediction[self.decoder_id]["pred_logits"][bid].shape[ + 0 + ], + self.model.num_classes - 1, + ) + + masks = self.get_full_res_mask( + masks, + inverse_maps[bid], + target_full_res[bid]["point2segment"], + ) + + heatmap = self.get_full_res_mask( + heatmap, + inverse_maps[bid], + target_full_res[bid]["point2segment"], + is_heatmap=True, + ) + + if backbone_features is not None: + backbone_features = self.get_full_res_mask( + torch.from_numpy(backbone_features), + inverse_maps[bid], + target_full_res[bid]["point2segment"], + is_heatmap=True, + ) + backbone_features = backbone_features.numpy() + else: + assert 
False, "not tested" + masks = self.get_full_res_mask( + prediction[self.decoder_id]["pred_masks"][bid].cpu(), + inverse_maps[bid], + target_full_res[bid]["point2segment"], + ) + + scores, masks, classes, heatmap = self.get_mask_and_scores( + prediction[self.decoder_id]["pred_logits"][bid].cpu(), + masks, + prediction[self.decoder_id]["pred_logits"][bid].shape[0], + self.model.num_classes - 1, + device="cpu", + ) + + masks = masks.numpy() + heatmap = heatmap.numpy() + + sort_scores = scores.sort(descending=True) + sort_scores_index = sort_scores.indices.cpu().numpy() + sort_scores_values = sort_scores.values.cpu().numpy() + sort_classes = classes[sort_scores_index] + + sorted_masks = masks[:, sort_scores_index] + sorted_heatmap = heatmap[:, sort_scores_index] + + if self.config.general.filter_out_instances: + keep_instances = set() + pairwise_overlap = sorted_masks.T @ sorted_masks + normalization = pairwise_overlap.max(axis=0) + norm_overlaps = pairwise_overlap / normalization + + for instance_id in range(norm_overlaps.shape[0]): + # filter out unlikely masks and nearly empty masks + # if not(sort_scores_values[instance_id] < 0.3 or sorted_masks[:, instance_id].sum() < 500): + if not ( + sort_scores_values[instance_id] + < self.config.general.scores_threshold + ): + # check if mask != empty + if not sorted_masks[:, instance_id].sum() == 0.0: + overlap_ids = set( + np.nonzero( + norm_overlaps[instance_id, :] + > self.config.general.iou_threshold + )[0] + ) + + if len(overlap_ids) == 0: + keep_instances.add(instance_id) + else: + if instance_id == min(overlap_ids): + keep_instances.add(instance_id) + + keep_instances = sorted(list(keep_instances)) + all_pred_classes.append(sort_classes[keep_instances]) + all_pred_masks.append(sorted_masks[:, keep_instances]) + all_pred_scores.append(sort_scores_values[keep_instances]) + all_heatmaps.append(sorted_heatmap[:, keep_instances]) + else: + all_pred_classes.append(sort_classes) + all_pred_masks.append(sorted_masks) + 
all_pred_scores.append(sort_scores_values) + all_heatmaps.append(sorted_heatmap) + + if self.validation_dataset.dataset_name == "scannet200": + all_pred_classes[bid][all_pred_classes[bid] == 0] = -1 + if self.config.data.test_mode != "test": + target_full_res[bid]["labels"][ + target_full_res[bid]["labels"] == 0 + ] = -1 + + for bid in range(len(prediction[self.decoder_id]["pred_masks"])): + all_pred_classes[ + bid + ] = self.validation_dataset._remap_model_output( + all_pred_classes[bid].cpu() + label_offset + ) + + if ( + self.config.data.test_mode != "test" + and len(target_full_res) != 0 + ): + target_full_res[bid][ + "labels" + ] = self.validation_dataset._remap_model_output( + target_full_res[bid]["labels"].cpu() + label_offset + ) + + # PREDICTION BOX + bbox_data = [] + for query_id in range( + all_pred_masks[bid].shape[1] + ): # self.model.num_queries + obj_coords = full_res_coords[bid][ + all_pred_masks[bid][:, query_id].astype(bool), : + ] + if obj_coords.shape[0] > 0: + obj_center = obj_coords.mean(axis=0) + obj_axis_length = obj_coords.max( + axis=0 + ) - obj_coords.min(axis=0) + + bbox = np.concatenate((obj_center, obj_axis_length)) + + bbox_data.append( + ( + all_pred_classes[bid][query_id].item(), + bbox, + all_pred_scores[bid][query_id], + ) + ) + self.bbox_preds[file_names[bid]] = bbox_data + + # GT BOX + bbox_data = [] + for obj_id in range(target_full_res[bid]["masks"].shape[0]): + if target_full_res[bid]["labels"][obj_id].item() == 255: + continue + + obj_coords = full_res_coords[bid][ + target_full_res[bid]["masks"][obj_id, :] + .cpu() + .detach() + .numpy() + .astype(bool), + :, + ] + if obj_coords.shape[0] > 0: + obj_center = obj_coords.mean(axis=0) + obj_axis_length = obj_coords.max( + axis=0 + ) - obj_coords.min(axis=0) + + bbox = np.concatenate((obj_center, obj_axis_length)) + bbox_data.append( + ( + target_full_res[bid]["labels"][obj_id].item(), + bbox, + ) + ) + + self.bbox_gt[file_names[bid]] = bbox_data + + if 
self.config.general.eval_inner_core == -1: + self.preds[file_names[bid]] = { + "pred_masks": all_pred_masks[bid], + "pred_scores": all_pred_scores[bid], + "pred_classes": all_pred_classes[bid], + } + else: + # prev val_dataset + self.preds[file_names[bid]] = { + "pred_masks": all_pred_masks[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + "pred_scores": all_pred_scores[bid], + "pred_classes": all_pred_classes[bid], + } + + if self.config.general.save_visualizations: + if "cond_inner" in self.test_dataset.data[idx[bid]]: + target_full_res[bid]["masks"] = target_full_res[bid][ + "masks" + ][:, self.test_dataset.data[idx[bid]]["cond_inner"]] + self.save_visualizations( + target_full_res[bid], + full_res_coords[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + [self.preds[file_names[bid]]["pred_masks"]], + [self.preds[file_names[bid]]["pred_classes"]], + file_names[bid], + original_colors[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + original_normals[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + [self.preds[file_names[bid]]["pred_scores"]], + sorted_heatmaps=[ + all_heatmaps[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ] + ], + query_pos=all_query_pos[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ] + if len(all_query_pos) > 0 + else None, + backbone_features=backbone_features[ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + point_size=self.config.general.visualization_point_size, + ) + else: + self.save_visualizations( + target_full_res[bid], + full_res_coords[bid], + [self.preds[file_names[bid]]["pred_masks"]], + [self.preds[file_names[bid]]["pred_classes"]], + file_names[bid], + original_colors[bid], + original_normals[bid], + [self.preds[file_names[bid]]["pred_scores"]], + sorted_heatmaps=[all_heatmaps[bid]], + query_pos=all_query_pos[bid] + if len(all_query_pos) > 0 + else None, + backbone_features=backbone_features, + point_size=self.config.general.visualization_point_size, + ) 
def eval_instance_epoch_end(self):
    """Turn the predictions gathered during evaluation into AP metrics and log them.

    Box AP at IoU 0.25 and 0.5 comes from ``eval_det`` on ``self.bbox_preds``
    and ``self.bbox_gt``.  Mask AP comes from ``evaluate``; the per-class
    table is then read back from ``tmp_output.txt`` in the run's
    ``eval_output`` directory (presumably written by ``evaluate`` -- verify).
    For ``scannet200`` the per-class scores are also averaged over the
    head / common / tail category groups.  Everything is passed to
    ``self.log_dict``; afterwards the output directory is removed unless
    ``config.general.export`` is set, and the prediction buffers are reset.

    If scoring fails with ``IndexError`` or ``OSError`` the three mean AP
    metrics are logged as 0.0 instead.

    Raises:
        AssertionError: a ScanNet200 validation class is in none of the
            head / common / tail category lists.
    """
    log_prefix = "val"
    ap_results = {}
    head_results, tail_results, common_results = [], [], []
    dataset_name = self.validation_dataset.dataset_name

    box_ap_50 = eval_det(
        self.bbox_preds, self.bbox_gt, ovthresh=0.5, use_07_metric=False
    )
    box_ap_25 = eval_det(
        self.bbox_preds, self.bbox_gt, ovthresh=0.25, use_07_metric=False
    )
    # The last element of each eval_det result maps class id -> box AP.
    mean_box_ap_25 = sum(box_ap_25[-1].values()) / len(box_ap_25[-1])
    mean_box_ap_50 = sum(box_ap_50[-1].values()) / len(box_ap_50[-1])

    ap_results[f"{log_prefix}_mean_box_ap_25"] = mean_box_ap_25
    ap_results[f"{log_prefix}_mean_box_ap_50"] = mean_box_ap_50

    for suffix, box_ap in (("box_ap_50", box_ap_50), ("box_ap_25", box_ap_25)):
        for class_id, class_ap in box_ap[-1].items():
            class_name = self.train_dataset.label_info[class_id]["name"]
            ap_results[f"{log_prefix}_{class_name}_val_{suffix}"] = class_ap

    root_path = "eval_output"
    base_path = f"{root_path}/instance_evaluation_{self.config.general.experiment_name}_{self.current_epoch}"

    if dataset_name in ["scannet", "stpls3d", "scannet200"]:
        gt_data_path = f"{self.validation_dataset.data_dir[0]}/instance_gt/{self.validation_dataset.mode}"
    else:
        gt_data_path = f"{self.validation_dataset.data_dir[0]}/instance_gt/Area_{self.config.general.area}"

    pred_path = f"{base_path}/tmp_output.txt"

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    try:
        if dataset_name == "s3dis":
            new_preds = {}
            for key in self.preds.keys():
                new_preds[
                    key.replace(f"Area_{self.config.general.area}_", "")
                ] = {
                    "pred_classes": self.preds[key]["pred_classes"] + 1,
                    "pred_masks": self.preds[key]["pred_masks"],
                    "pred_scores": self.preds[key]["pred_scores"],
                }
            mprec, mrec = evaluate(
                new_preds, gt_data_path, pred_path, dataset="s3dis"
            )
            ap_results[f"{log_prefix}_mean_precision"] = mprec
            ap_results[f"{log_prefix}_mean_recall"] = mrec
        elif dataset_name == "stpls3d":
            new_preds = {}
            for key in self.preds.keys():
                new_preds[key.replace(".txt", "")] = {
                    "pred_classes": self.preds[key]["pred_classes"],
                    "pred_masks": self.preds[key]["pred_masks"],
                    "pred_scores": self.preds[key]["pred_scores"],
                }

            evaluate(new_preds, gt_data_path, pred_path, dataset="stpls3d")
        else:
            evaluate(
                self.preds,
                gt_data_path,
                pred_path,
                dataset=dataset_name,
            )

        with open(pred_path, "r") as fin:
            for line_id, line in enumerate(fin):
                if line_id == 0:
                    # ignore header
                    continue
                class_name, _, ap, ap_50, ap_25 = line.strip().split(",")

                if (
                    dataset_name == "scannet200"
                    and class_name not in VALID_CLASS_IDS_200_VALIDATION
                ):
                    continue

                class_scores = (float(ap), float(ap_50), float(ap_25))
                ap_results[f"{log_prefix}_{class_name}_val_ap"] = class_scores[0]
                ap_results[f"{log_prefix}_{class_name}_val_ap_50"] = class_scores[1]
                ap_results[f"{log_prefix}_{class_name}_val_ap_25"] = class_scores[2]

                if dataset_name != "scannet200":
                    continue

                if class_name in HEAD_CATS_SCANNET_200:
                    head_results.append(np.array(class_scores))
                elif class_name in COMMON_CATS_SCANNET_200:
                    common_results.append(np.array(class_scores))
                elif class_name in TAIL_CATS_SCANNET_200:
                    tail_results.append(np.array(class_scores))
                else:
                    # The original `assert (False, "...")` asserted a
                    # non-empty tuple and therefore could never fail.
                    raise AssertionError("class not known!")

        if dataset_name == "scannet200":
            head_results = np.stack(head_results)
            common_results = np.stack(common_results)
            tail_results = np.stack(tail_results)

            mean_tail_results = np.nanmean(tail_results, axis=0)
            mean_common_results = np.nanmean(common_results, axis=0)
            mean_head_results = np.nanmean(head_results, axis=0)

            # Columns follow the order the scores were appended in:
            # (ap, ap_50, ap_25).  Column 0 used to be stored under the
            # "ap_25" key and was then overwritten by column 2.
            for column, suffix in enumerate(("ap", "ap_50", "ap_25")):
                ap_results[
                    f"{log_prefix}_mean_tail_{suffix}"
                ] = mean_tail_results[column]
                ap_results[
                    f"{log_prefix}_mean_common_{suffix}"
                ] = mean_common_results[column]
                ap_results[
                    f"{log_prefix}_mean_head_{suffix}"
                ] = mean_head_results[column]

            overall_ap_results = np.nanmean(
                np.vstack((head_results, common_results, tail_results)),
                axis=0,
            )

            ap_results[f"{log_prefix}_mean_ap"] = overall_ap_results[0]
            ap_results[f"{log_prefix}_mean_ap_50"] = overall_ap_results[1]
            ap_results[f"{log_prefix}_mean_ap_25"] = overall_ap_results[2]
        else:
            mean_ap = statistics.mean(
                [
                    item
                    for key, item in ap_results.items()
                    if key.endswith("val_ap")
                ]
            )
            mean_ap_50 = statistics.mean(
                [
                    item
                    for key, item in ap_results.items()
                    if key.endswith("val_ap_50")
                ]
            )
            mean_ap_25 = statistics.mean(
                [
                    item
                    for key, item in ap_results.items()
                    if key.endswith("val_ap_25")
                ]
            )

            ap_results[f"{log_prefix}_mean_ap"] = mean_ap
            ap_results[f"{log_prefix}_mean_ap_50"] = mean_ap_50
            ap_results[f"{log_prefix}_mean_ap_25"] = mean_ap_25

        # NaN scores are logged as 0.0.
        ap_results = {
            key: 0.0 if math.isnan(score) else score
            for key, score in ap_results.items()
        }
    except (IndexError, OSError):
        print("NO SCORES!!!")
        ap_results[f"{log_prefix}_mean_ap"] = 0.0
        ap_results[f"{log_prefix}_mean_ap_50"] = 0.0
        ap_results[f"{log_prefix}_mean_ap_25"] = 0.0

    self.log_dict(ap_results)

    if not self.config.general.export:
        shutil.rmtree(base_path)

    # Drop the accumulated predictions before starting fresh buffers.
    del self.preds
    del self.bbox_preds
    del self.bbox_gt

    gc.collect()

    self.preds = dict()
    self.bbox_preds = dict()
    self.bbox_gt = dict()
def configure_optimizers(self):
    """Instantiate the optimizer and LR scheduler described in the config.

    When the scheduler config has a ``steps_per_epoch`` entry it is set to
    the length of the training dataloader before the scheduler is built.

    Returns:
        ``([optimizer], [scheduler_config])`` where ``scheduler_config``
        holds the scheduler under ``"scheduler"`` plus every entry of
        ``config.scheduler.pytorch_lightning_params``.
    """
    scheduler_cfg = self.config.scheduler.scheduler
    optimizer = hydra.utils.instantiate(
        self.config.optimizer, params=self.parameters()
    )

    if "steps_per_epoch" in scheduler_cfg.keys():
        batches_per_epoch = len(self.train_dataloader())
        self.config.scheduler.scheduler.steps_per_epoch = batches_per_epoch

    scheduler_config = dict(
        scheduler=hydra.utils.instantiate(
            self.config.scheduler.scheduler, optimizer=optimizer
        )
    )
    scheduler_config.update(self.config.scheduler.pytorch_lightning_params)
    return [optimizer], [scheduler_config]
def plot_grad_flow(named_parameters):
    """Plot the mean absolute gradient of each trainable non-bias parameter.

    Parameters without a gradient (``p.grad is None``) are reported on
    stdout and left out of the plot.

    Args:
        named_parameters: iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()``.
    """
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if p.requires_grad and "bias" not in n:
            # `if p.grad:` would take the truth value of a tensor, which
            # raises for multi-element gradients and is False for a zero
            # scalar gradient; the intent is "has a gradient at all".
            if p.grad is not None:
                layers.append(n)
                ave_grads.append(p.grad.abs().mean())
            else:
                print(f"{n} - doesn't have gradient computed")

    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
class StratifiedGroupKFold:
    """
    Stratified Group K-fold with sklearn.model_selection.KFold compatibility.

    Split dataset into k folds with balanced label distribution (stratified)
    and non-overlapping group.

    Args:
        n_splits (int): # of splits
        shuffle (bool): Shuffle
        random_state (int): Seed value for random number generator
    """

    def __init__(self, n_splits, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.seed = random_state

    def split(self, X, labels, groups):
        """Yield ``(train_indices, test_indices)`` for each of the folds.

        Groups are assigned greedily, one at a time, to the fold that keeps
        the per-label counts most even across folds; a group never appears
        in more than one test fold.

        Args:
            X: samples; only its length is used.
            labels: one hashable label per sample.
            groups: one hashable group id per sample.

        Yields:
            Two lists of sample indices: samples whose group is outside the
            fold, and samples whose group is inside it.
        """
        assert len(X) == len(labels) == len(groups), "Invalid input length"
        assert (
            len(set(groups)) >= self.n_splits
        ), "The number of groups needs to be larger than n_splits"

        def encode(values):
            # Number distinct values in first-occurrence order.  Iterating a
            # set instead makes the numbering -- and therefore the folds --
            # depend on per-process string hash randomisation, which defeats
            # `random_state`.
            codes = {
                value: code
                for code, value in enumerate(dict.fromkeys(values))
            }
            return [codes[value] for value in values]

        labels, groups = encode(labels), encode(groups)
        num_labels, num_groups = max(labels) + 1, max(groups) + 1
        label_counts_per_group = np.zeros((num_groups, num_labels), dtype=int)
        global_label_dist = np.bincount(labels)
        for label, g in zip(labels, groups):
            label_counts_per_group[g][label] += 1

        label_counts_per_fold = np.zeros(
            (self.n_splits, num_labels), dtype=int
        )
        groups_per_fold = [set() for _ in range(self.n_splits)]

        def eval_label_counts_per_fold(y_counts, fold):
            # `fold` is a row view of label_counts_per_fold: add the group
            # temporarily, measure the spread across folds, then undo.
            fold += y_counts
            std_per_label = (
                np.std(label_counts_per_fold, axis=0) / global_label_dist
            )
            fold -= y_counts
            return np.mean(std_per_label)

        groups_and_label_counts = list(enumerate(label_counts_per_group))
        if self.shuffle:
            rng = random.Random(self.seed)
            mean_std = np.mean(np.std(label_counts_per_group, axis=1))
            groups_and_label_counts.sort(
                key=lambda g_counts: -np.std(g_counts[1])
                + rng.gauss(0, mean_std)
            )  # add rng.gauss to increase the randomness
        else:
            groups_and_label_counts.sort(
                key=lambda g_counts: -np.std(g_counts[1])
            )

        for g, label_counts in groups_and_label_counts:
            evals = [
                eval_label_counts_per_fold(
                    label_counts, label_counts_per_fold[i]
                )
                for i in range(self.n_splits)
            ]
            best_fold = np.argmin(evals)
            label_counts_per_fold[best_fold] += label_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for test_groups in groups_per_fold:
            train_groups = all_groups - test_groups

            train_indices = [
                i for i, g in enumerate(groups) if g in train_groups
            ]
            test_indices = [
                i for i, g in enumerate(groups) if g in test_groups
            ]

            yield train_indices, test_indices
def save_visualization(
    coordinates,
    name="none",
    color=None,
    normals=None,
    target=None,
    prediction=None,
    target_info=None,
    path="./saved",
    backend="pyviz3d",
    voxel_size=0.05,
    color_mean=(0.47793125906962, 0.4303257521323044, 0.3749598901421883),
    color_std=(0.2834475483823543, 0.27566157565723015, 0.27018971370874995),
):
    """Visualise a point cloud with its target and predicted labels.

    Args:
        coordinates: array whose first three columns are x, y, z; they are
            centred on their mean and scaled by ``voxel_size``.
        name: name given to the point set (pyviz3d backend).
        color: optional normalised colours; de-normalised with
            ``color_std`` / ``color_mean`` and scaled by 255.
        normals: optional normals forwarded to the backend.
        target: per-point target labels.
        prediction: per-point predicted labels.
        target_info: optional mapping ``label id -> {"color", "name"}``.
            When given, labels are remapped through
            ``_remap_model_output`` and coloured / named from it; when
            None, labels are used as-is and points keep the default black
            colour and the text "empty".
        path: output location (pyviz3d backend).
        backend: ``"pyviz3d"``, ``"plotly"`` or ``"open3d"``; anything else
            prints "No such backend".
        voxel_size: scale applied to the centred coordinates.
        color_mean: per-channel mean used to de-normalise ``color``.
        color_std: per-channel std used to de-normalise ``color``.
    """
    if target_info is not None:
        target = _remap_model_output(target, target_info)
        prediction = _remap_model_output(prediction, target_info)
    else:
        # Nothing to remap against; previously this default crashed inside
        # _remap_model_output even though the colouring below is guarded.
        target = np.array(target)
        prediction = np.array(prediction)

    coordinates = coordinates[:, :3] - coordinates[:, :3].mean(axis=0)
    coordinates = coordinates * voxel_size
    if color is not None:
        color = (color * np.asarray(color_std) + np.asarray(color_mean)) * 255

    target_color = np.zeros((len(target), 3))
    prediction_color = np.zeros((len(prediction), 3))
    # dtype=object: a plain np.full(n, "empty") is a fixed-width '<U5' array
    # that silently truncates every class name to five characters.
    target_text = np.full(len(target), "empty", dtype=object)
    prediction_text = np.full(len(prediction), "empty", dtype=object)
    if target_info is not None:
        for k, v in target_info.items():
            target_color[target == k] = v["color"]
            target_text[target == k] = v["name"]
            prediction_color[prediction == k] = v["color"]
            prediction_text[prediction == k] = v["name"]

    if backend == "pyviz3d":
        point_cloud_pyviz3d(
            name=name,
            coordinates=coordinates,
            path=path,
            color=color,
            normals=normals,
            label_color=target_color,
            prediction_color=prediction_color,
            # coordinates are already scaled above
            voxel_size=1,
        )
    elif backend == "plotly":
        point_cloud_plolty(
            coordinates=coordinates,
            normals=normals,
            label_color=target_color,
            label_text=target_text,
            prediction_color=prediction_color,
            prediction_text=prediction_text,
        )
    elif backend == "open3d":
        point_cloud_open3d(coordinates)
    else:
        print("No such backend")
def flatten_dict(d, parent_key="", sep="_"):
    """
    Collapse a nested mapping into a single-level dict whose keys are the
    nested keys joined by `sep`; `parent_key`, when non-empty, prefixes
    every key.

    https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not parent_key else parent_key + sep + key
        if isinstance(value, MutableMapping):
            # Recurse with the joined key as the new prefix.
            flat.update(flatten_dict(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat
def load_checkpoint_with_missing_or_exsessive_keys(cfg, model):
    """Load ``cfg.general.checkpoint`` into ``model``, tolerating key mismatches.

    The checkpoint file must contain a ``"state_dict"`` entry. Differences
    between that state dict and ``model.state_dict()`` are logged and resolved
    as follows:

    * entries missing from the checkpoint keep the model's current value;
    * entries whose shape differs keep the model's current value;
    * checkpoint entries unknown to the model are discarded.

    Args:
        cfg: configuration object; only ``cfg.general.checkpoint`` (the path
            of the checkpoint file) is read.
        model: module whose parameters and buffers are overwritten in place.

    Returns:
        The tuple ``(cfg, model)`` - the same objects that were passed in.
    """
    # Read the file once (it used to be read twice) and map every tensor to
    # the CPU so that a checkpoint saved on a GPU can also be restored on a
    # CPU-only host; ``load_state_dict`` copies the values onto whatever
    # device the model's own tensors live on.
    state_dict = torch.load(cfg.general.checkpoint, map_location="cpu")[
        "state_dict"
    ]
    model_state = dict(model.state_dict())

    # Report checkpoint entries that have no counterpart in the model.
    for key in state_dict:
        if key not in model_state:
            logger.warning(
                f"Key not found, it will be initialized randomly: {key}"
            )

    # Entries that are missing or have the wrong shape fall back to the
    # value the model currently holds.
    for key, current in model_state.items():
        if key not in state_dict:
            logger.warning(f"{key} not in loaded checkpoint")
            state_dict[key] = current
        elif state_dict[key].shape != current.shape:
            logger.warning(
                f"incorrect shape {key}:{state_dict[key].shape} vs {current.shape}"
            )
            state_dict[key] = current

    # Keep only the entries the model knows about.
    new_state_dict = {}
    for key, value in state_dict.items():
        if key in model_state:
            new_state_dict[key] = value
        else:
            logger.warning(f"excessive key: {key}")
    model.load_state_dict(new_state_dict)
    return cfg, model
b/models/Mask3D/mask3d.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..8bc09c1d9f7d373a6ae88b90feddb4097f838333 --- /dev/null +++ b/models/Mask3D/mask3d.egg-info/PKG-INFO @@ -0,0 +1,11 @@ +Metadata-Version: 2.1 +Name: mask3d +Version: 0.1 +Summary: UNKNOWN +Home-page: UNKNOWN +License: UNKNOWN +Platform: UNKNOWN +License-File: LICENSE + +UNKNOWN + diff --git a/models/Mask3D/mask3d.egg-info/SOURCES.txt b/models/Mask3D/mask3d.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8664a91f3fa541efb77e4f4bb3dd0dde5aadf2d --- /dev/null +++ b/models/Mask3D/mask3d.egg-info/SOURCES.txt @@ -0,0 +1,110 @@ +LICENSE +MANIFEST.in +README.md +setup.py +mask3d/__init__.py +mask3d/main_instance_segmentation.py +mask3d/predict.py +mask3d/preprocess_arkitscenes.py +mask3d.egg-info/PKG-INFO +mask3d.egg-info/SOURCES.txt +mask3d.egg-info/dependency_links.txt +mask3d.egg-info/top_level.txt +mask3d/benchmark/__init__.py +mask3d/benchmark/evaluate_semantic_instance.py +mask3d/benchmark/util.py +mask3d/benchmark/util_3d.py +mask3d/conf/__init__.py +mask3d/conf/config_base_instance_segmentation.yaml +mask3d/conf/augmentation/albumentations_aug.yaml +mask3d/conf/augmentation/volumentations_aug.yaml +mask3d/conf/callbacks/callbacks_instance_segmentation.yaml +mask3d/conf/data/indoor.yaml +mask3d/conf/data/outdoor.yaml +mask3d/conf/data/collation_functions/voxelize_collate.yaml +mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml +mask3d/conf/data/data_loaders/simple_loader.yaml +mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml +mask3d/conf/data/datasets/matterport.yaml +mask3d/conf/data/datasets/matterport_scannet.yaml +mask3d/conf/data/datasets/rio.yaml +mask3d/conf/data/datasets/s3dis.yaml +mask3d/conf/data/datasets/scannet.yaml +mask3d/conf/data/datasets/scannet200.yaml +mask3d/conf/data/datasets/semantic_kitti.yaml +mask3d/conf/data/datasets/stpls3d.yaml +mask3d/conf/logging/base.yaml 
+mask3d/conf/logging/full.yaml +mask3d/conf/logging/minimal.yaml +mask3d/conf/logging/offline.yaml +mask3d/conf/loss/cross_entropy.yaml +mask3d/conf/loss/set_criterion.yaml +mask3d/conf/loss/set_criterion_custom_weights_1.yaml +mask3d/conf/matcher/hungarian_matcher.yaml +mask3d/conf/metrics/miou.yaml +mask3d/conf/model/mask3d.yaml +mask3d/conf/optimizer/adamw.yaml +mask3d/conf/optimizer/adamw_lower.yaml +mask3d/conf/scheduler/exponentiallr.yaml +mask3d/conf/scheduler/lambdalr.yaml +mask3d/conf/scheduler/onecyclelr.yaml +mask3d/conf/trainer/trainer.yaml +mask3d/conf/trainer/trainer600.yaml +mask3d/datasets/__init__.py +mask3d/datasets/outdoor_semseg.py +mask3d/datasets/random_cuboid.py +mask3d/datasets/semseg.py +mask3d/datasets/utils.py +mask3d/datasets/preprocessing/__init__.py +mask3d/datasets/preprocessing/arkitscenes_preprocessing.py +mask3d/datasets/preprocessing/base_preprocessing.py +mask3d/datasets/preprocessing/s3dis_preprocessing.py +mask3d/datasets/preprocessing/scannet_preprocessing.py +mask3d/datasets/preprocessing/semantic_kitti_preprocessing.py +mask3d/datasets/preprocessing/stpls3d_preprocessing.py +mask3d/datasets/scannet200/__init__.py +mask3d/datasets/scannet200/scannet200_constants.py +mask3d/datasets/scannet200/scannet200_splits.py +mask3d/models/__init__.py +mask3d/models/criterion.py +mask3d/models/mask3d.py +mask3d/models/matcher.py +mask3d/models/misc.py +mask3d/models/model.py +mask3d/models/position_embedding.py +mask3d/models/res16unet.py +mask3d/models/resnet.py +mask3d/models/resunet.py +mask3d/models/wrapper.py +mask3d/models/metrics/__init__.py +mask3d/models/metrics/confusionmatrix.py +mask3d/models/metrics/metrics.py +mask3d/models/modules/3detr_helpers.py +mask3d/models/modules/__init__.py +mask3d/models/modules/common.py +mask3d/models/modules/helpers_3detr.py +mask3d/models/modules/resnet_block.py +mask3d/models/modules/senet_block.py +mask3d/trainer/__init__.py +mask3d/trainer/trainer.py +mask3d/utils/__init__.py 
+mask3d/utils/gradflow_check.py +mask3d/utils/kfold.py +mask3d/utils/pc_visualizations.py +mask3d/utils/point_cloud_utils.py +mask3d/utils/utils.py +mask3d/utils/pointops2/__init__.py +mask3d/utils/pointops2/setup.py +mask3d/utils/pointops2/functions/__init__.py +mask3d/utils/pointops2/functions/pointops.py +mask3d/utils/pointops2/functions/pointops2.py +mask3d/utils/pointops2/functions/pointops_ablation.py +mask3d/utils/pointops2/functions/test_attention_op_step1.py +mask3d/utils/pointops2/functions/test_attention_op_step1_v2.py +mask3d/utils/pointops2/functions/test_attention_op_step2.py +mask3d/utils/pointops2/functions/test_relative_pos_encoding_op_step1.py +mask3d/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v2.py +mask3d/utils/pointops2/functions/test_relative_pos_encoding_op_step1_v3.py +mask3d/utils/pointops2/functions/test_relative_pos_encoding_op_step2.py +mask3d/utils/pointops2/functions/test_relative_pos_encoding_op_step2_v2.py +mask3d/utils/pointops2/src/__init__.py \ No newline at end of file diff --git a/models/Mask3D/mask3d.egg-info/dependency_links.txt b/models/Mask3D/mask3d.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/models/Mask3D/mask3d.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/models/Mask3D/mask3d.egg-info/top_level.txt b/models/Mask3D/mask3d.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..347620dbc6cab3f22ef5e880a7f4ff468f301c49 --- /dev/null +++ b/models/Mask3D/mask3d.egg-info/top_level.txt @@ -0,0 +1 @@ +mask3d diff --git a/models/Mask3D/mask3d/__init__.py b/models/Mask3D/mask3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b6e21d418d6ee195db3d2b8682476c8fb448cd --- /dev/null +++ b/models/Mask3D/mask3d/__init__.py @@ -0,0 +1,276 @@ +import hydra +import torch +from torch_scatter import scatter_mean + +from 
def get_model(checkpoint_path=None, dataset_name = "scannet200"):
    """Build an ``InstanceSegmentation`` model configured for inference.

    The base Hydra config ``config_base_instance_segmentation.yaml`` is
    composed from the ``conf`` directory, patched with dataset-specific
    overrides and used to construct the model. Checkpoints named in the
    config are loaded afterwards.

    Args:
        checkpoint_path: stored in ``cfg.general.checkpoint``; when it is not
            ``None`` the checkpoint is loaded into the model.
        dataset_name: ``"scannet200"`` or ``"scannet"`` select the overrides
            below; any other value leaves the composed config untouched.

    Returns:
        The constructed model.
    """
    # Compose the base configuration from the packaged config directory.
    with initialize(config_path="conf"):
        cfg = compose(config_name="config_base_instance_segmentation.yaml")

    cfg.general.checkpoint = checkpoint_path

    # It would be nice to avoid this hard-coding, e.g. by deriving the
    # dataset from the checkpoint file name.
    # (dataset name, general.num_targets, data.num_labels); every other
    # override is shared between the supported datasets.
    per_dataset = (("scannet200", 201, 200), ("scannet", 19, 20))
    for known_name, num_targets, num_labels in per_dataset:
        if dataset_name == known_name:
            general = cfg.general
            general.num_targets = num_targets
            general.train_mode = False
            general.eval_on_segments = True
            general.topk_per_image = 300
            general.use_dbscan = True
            general.dbscan_eps = 0.95
            general.export_threshold = 0.001

            # data
            cfg.data.num_labels = num_labels
            cfg.data.test_mode = "test"

            # model
            cfg.model.num_queries = 150

    # TODO: cfg.model.scene_min / scene_max still have to be fixed and
    # discussed with Jonas.
    model = InstanceSegmentation(cfg)

    # The backbone checkpoint (if any) is applied before the full checkpoint.
    if cfg.general.backbone_checkpoint is not None:
        cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys(
            cfg, model
        )
    if cfg.general.checkpoint is not None:
        cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model)

    return model
+ datatype = "mesh" + except: + pcd = load_ply(pointcloud_file) + points = np.asarray(pcd.points) + colors = np.asarray(pcd.colors) + datatype = "point cloud" + + if datatype is None: + print("DATA TYPE IS NOT SUPPORTED!") + exit() + segments = None + elif pointcloud_file.split('.')[-1] == 'npy': + points = np.load(pointcloud_file) + points, colors, normals, segments, labels = ( + points[:, :3], + points[:, 3:6], + points[:, 6:9], + points[:, 9], + points[:, 10:12], + ) + datatype = "mesh" + + else: + print("FORMAT NOT SUPPORTED") + exit() + if datatype == "mesh": + pseudo_image = colors.astype(np.uint8)[np.newaxis, :, :] + colors = np.squeeze(normalize_color(image=pseudo_image)["image"]) + + coords = np.floor(points / 0.02) + _, _, unique_map, inverse_map = ME.utils.sparse_quantize( + coordinates=coords, + features=colors, + return_index=True, + return_inverse=True, + ) + + sample_coordinates = coords[unique_map] + coordinates = [torch.from_numpy(sample_coordinates).int()] + sample_features = colors[unique_map] + features = [torch.from_numpy(sample_features).float()] + + if segments is not None: + point2segment_full = segments + point2segment = segments[unique_map] + point2segment = [torch.from_numpy(point2segment).long()] + point2segment_full = [torch.from_numpy(point2segment_full).long()] + + # Concatenate all lists + input_dict = {"coords": coordinates, "feats": features} + if len(point2segment) > 0: + input_dict["labels"] = point2segment + coordinates, _, point2segment = ME.utils.sparse_collate(**input_dict) + point2segment = point2segment.cuda() + else: + coordinates, _ = ME.utils.sparse_collate(**input_dict) + point2segment = None + point2segment_full = None + else: + point2segment = None + point2segment_full = None + coordinates, _ = ME.utils.sparse_collate(coords=coordinates, feats=features) + + features = torch.cat(features, dim=0) + data = ME.SparseTensor( + coordinates=coordinates, + features=features, + device=device, + ) + return data, points, colors, 
def map_output_to_pointcloud(outputs,
                             inverse_map,
                             point2segment,
                             point2segment_full):
    """Turn raw model outputs into per-point binary instance masks and scores.

    Args:
        outputs: dict with ``"pred_logits"`` and ``"pred_masks"``; only the
            first element along dim 0 of each is used. After the softmax the
            last logit column is dropped.
            # assumes that column is the "no object" class - TODO confirm
        inverse_map: index that expands the rows of the mask back to the
            full-resolution points.
        point2segment: optional index applied to the rows of the mask before
            scoring; ``None`` leaves the mask as predicted.
        point2segment_full: optional sequence whose first element is handed
            to ``get_full_res_mask``; ``None`` expands with ``inverse_map``
            only.

    Returns:
        ``(result_pred_mask, score)``: the 0/1 float masks, one column per
        selected query, and the matching scores (class probability times the
        mean mask probability inside the mask), sorted by class probability.
    """
    # parse predictions
    logits = torch.softmax(outputs["pred_logits"], dim=-1)[..., :-1]
    masks = outputs["pred_masks"]
    # reformat predictions: drop the leading dimension
    logits = logits[0]
    masks = masks[0] if point2segment is None else masks[0][point2segment]

    num_queries, num_classes = logits.shape
    scores_per_query, topk_indices = logits.flatten(0, 1).topk(
        num_queries, sorted=True
    )

    # An index into the flattened (query, class) grid maps back to its query
    # by dividing by the number of classes. This was hard-coded to 200,
    # which is only correct for a 200-class model.
    topk_indices = topk_indices // num_classes
    masks = masks[:, topk_indices]

    result_pred_mask = (masks > 0).float()
    heatmap = masks.float().sigmoid()

    # mean mask probability inside each binary mask (epsilon avoids 0/0)
    mask_scores_per_image = (heatmap * result_pred_mask).sum(0) / (
        result_pred_mask.sum(0) + 1e-6
    )
    score = scores_per_query * mask_scores_per_image
    if point2segment_full is not None:
        result_pred_mask = get_full_res_mask(
            result_pred_mask, inverse_map, point2segment_full[0]
        )
    else:
        result_pred_mask = result_pred_mask[inverse_map]
    return (result_pred_mask, score)
if __name__ == '__main__':
    # Demo: load a ScanNet200 checkpoint, run the model on one point cloud
    # and write a colourised copy of the mesh.
    #
    # NOTE(review): the calls below do not match the functions defined in
    # this file and look stale - confirm before relying on this script:
    #   * prepare_data(pointcloud_file, device) splits its first argument as
    #     a file name and returns eight values, but it is given the loaded
    #     mesh here and only six values are unpacked;
    #   * map_output_to_pointcloud(outputs, inverse_map, point2segment,
    #     point2segment_full) returns (masks, scores), but it is called with
    #     the mesh as first argument and its result is used as label array.

    model = get_model('checkpoints/scannet200/scannet200_benchmark.ckpt')
    model.eval()
    # fall back to the CPU when no GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # load input data
    pointcloud_file = 'data/pcl.ply'
    mesh = load_mesh(pointcloud_file)

    # prepare data
    data, points, colors, features, unique_map, inverse_map = prepare_data(mesh, device)

    # run model (inference only, no gradients)
    with torch.no_grad():
        outputs = model(data, raw_coordinates=features)

    # map output to point cloud
    labels = map_output_to_pointcloud(mesh, outputs, inverse_map)

    # save colorized mesh
    save_colorized_mesh(mesh, labels, 'data/pcl_labelled.ply', colormap='scannet200')
mask over verts as .txt] [(pred2) label id] [(pred2) confidence] +# ... +# +# NOTE: The prediction files must live in the root of the given prediction path. +# Predicted mask .txt files must live in a subfolder. +# Additionally, filenames must not contain spaces. +# The relative paths to predicted masks must contain one integer per line, +# where each line corresponds to vertices in the *_vh_clean_2.ply (in that order). +# Non-zero integers indicate part of the predicted instance. +# The label ids specify the class of the corresponding mask. +# Confidence is a float confidence score of the mask. +# +# Note that only the valid classes are used for evaluation, +# i.e., any ground truth label not in the valid label set +# is ignored in the evaluation. +# +# example usage: evaluate_semantic_instance.py --scan_path [path to scan data] --output_file [output file] + +# python imports +import math +import os, sys, argparse +import inspect +from copy import deepcopy +from uuid import uuid4 + +import torch + +try: + import numpy as np +except: + print("Failed to import numpy package.") + sys.exit(-1) + +from scipy import stats + +# currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +# parentdir = os.path.dirname(currentdir) +# sys.path.insert(0,parentdir) +import benchmark.util as util +import benchmark.util_3d as util_3d + +# parser = argparse.ArgumentParser() +# parser.add_argument('--gt_path', default='', help='path to directory of gt .txt files') +# parser.add_argument('--output_file', default='', help='output file [default: ./semantic_instance_evaluation.txt]') +# opt = parser.parse_args() + +# if opt.output_file == '': +# opt.output_file = os.path.join(os.getcwd(), 'semantic_instance_evaluation.txt') + + +# ---------- Label info ---------- # +CLASS_LABELS = [ + "cabinet", + "bed", + "chair", + "sofa", + "table", + "door", + "window", + "bookshelf", + "picture", + "counter", + "desk", + "curtain", + "refrigerator", + "shower curtain", 
def evaluate_matches(matches):
    """Compute the average precision per class and overlap threshold.

    Args:
        matches: mapping of scan key to ``{"gt": ..., "pred": ...}``. Both
            entries map every name in ``CLASS_LABELS`` to a list of instance
            dicts. Ground-truth dicts are read for ``instance_id``,
            ``vert_count``, ``med_dist``, ``dist_conf`` and ``matched_pred``;
            prediction dicts for ``uuid``, ``vert_count``, ``confidence``,
            ``void_intersection`` and ``matched_gt``. The matched copies
            additionally carry ``intersection``.

    Returns:
        Array of shape ``(1, len(CLASS_LABELS), len(opt["overlaps"]))``.
        Only the first entry of ``opt["min_region_sizes"]``,
        ``opt["distance_threshes"]`` and ``opt["distance_confs"]`` is used.
        A class without ground truth yields NaN, a class with ground truth
        but without predictions yields 0.0.
    """
    overlaps = opt["overlaps"]
    min_region_sizes = [opt["min_region_sizes"][0]]
    dist_threshes = [opt["distance_threshes"][0]]
    dist_confs = [opt["distance_confs"][0]]

    # results: class x overlap
    ap = np.zeros(
        (len(dist_threshes), len(CLASS_LABELS), len(overlaps)), float
    )
    for di, (min_region_size, distance_thresh, distance_conf) in enumerate(
        zip(min_region_sizes, dist_threshes, dist_confs)
    ):
        for oi, overlap_th in enumerate(overlaps):
            # Every prediction starts unassigned for each overlap threshold.
            # (An extra outer loop over the label keys used to repeat this
            # initialisation once per label without changing the result.)
            pred_visited = {}
            for m in matches:
                for label_name in CLASS_LABELS:
                    for p in matches[m]["pred"][label_name]:
                        if "uuid" in p:
                            pred_visited[p["uuid"]] = False
            for li, label_name in enumerate(CLASS_LABELS):
                y_true = np.empty(0)
                y_score = np.empty(0)
                hard_false_negatives = 0
                has_gt = False
                has_pred = False
                for m in matches:
                    pred_instances = matches[m]["pred"][label_name]
                    gt_instances = matches[m]["gt"][label_name]
                    # filter groups in ground truth
                    gt_instances = [
                        gt
                        for gt in gt_instances
                        if gt["instance_id"] >= 1000
                        and gt["vert_count"] >= min_region_size
                        and gt["med_dist"] <= distance_thresh
                        and gt["dist_conf"] >= distance_conf
                    ]
                    if gt_instances:
                        has_gt = True
                    if pred_instances:
                        has_pred = True

                    cur_true = np.ones(len(gt_instances))
                    cur_score = np.ones(len(gt_instances)) * (-float("inf"))
                    cur_match = np.zeros(len(gt_instances), dtype=bool)
                    # collect matches
                    for gti, gt in enumerate(gt_instances):
                        found_match = False
                        for pred in gt["matched_pred"]:
                            # greedy assignments
                            if pred_visited[pred["uuid"]]:
                                continue
                            # intersection over union of gt and prediction
                            overlap = float(pred["intersection"]) / (
                                gt["vert_count"]
                                + pred["vert_count"]
                                - pred["intersection"]
                            )
                            if overlap > overlap_th:
                                confidence = pred["confidence"]
                                # if already have a prediction for this gt,
                                # the prediction with the lower score is
                                # automatically a false positive
                                if cur_match[gti]:
                                    max_score = max(cur_score[gti], confidence)
                                    min_score = min(cur_score[gti], confidence)
                                    cur_score[gti] = max_score
                                    # append false positive
                                    cur_true = np.append(cur_true, 0)
                                    cur_score = np.append(cur_score, min_score)
                                    cur_match = np.append(cur_match, True)
                                # otherwise set score
                                else:
                                    found_match = True
                                    cur_match[gti] = True
                                    cur_score[gti] = confidence
                                    pred_visited[pred["uuid"]] = True
                        if not found_match:
                            hard_false_negatives += 1
                    # remove non-matched ground truth instances
                    cur_true = cur_true[cur_match]
                    cur_score = cur_score[cur_match]

                    # collect non-matched predictions as false positive
                    for pred in pred_instances:
                        found_gt = False
                        for gt in pred["matched_gt"]:
                            overlap = float(gt["intersection"]) / (
                                gt["vert_count"]
                                + pred["vert_count"]
                                - gt["intersection"]
                            )
                            if overlap > overlap_th:
                                found_gt = True
                                break
                        if not found_gt:
                            num_ignore = pred["void_intersection"]
                            for gt in pred["matched_gt"]:
                                # group?
                                if gt["instance_id"] < 1000:
                                    num_ignore += gt["intersection"]
                                # small ground truth instances
                                if (
                                    gt["vert_count"] < min_region_size
                                    or gt["med_dist"] > distance_thresh
                                    or gt["dist_conf"] < distance_conf
                                ):
                                    num_ignore += gt["intersection"]
                            proportion_ignore = (
                                float(num_ignore) / pred["vert_count"]
                            )
                            # if not ignored append false positive
                            if proportion_ignore <= overlap_th:
                                cur_true = np.append(cur_true, 0)
                                confidence = pred["confidence"]
                                cur_score = np.append(cur_score, confidence)

                    # append to overall results
                    y_true = np.append(y_true, cur_true)
                    y_score = np.append(y_score, cur_score)

                # compute average precision
                if has_gt and has_pred:
                    # compute precision recall curve first

                    # sorting and cumsum
                    score_arg_sort = np.argsort(y_score)
                    y_score_sorted = y_score[score_arg_sort]
                    y_true_sorted = y_true[score_arg_sort]
                    y_true_sorted_cumsum = np.cumsum(y_true_sorted)

                    # unique thresholds
                    (thresholds, unique_indices) = np.unique(
                        y_score_sorted, return_index=True
                    )
                    num_prec_recall = len(unique_indices) + 1

                    # prepare precision recall
                    num_examples = len(y_score_sorted)
                    # https://github.com/ScanNet/ScanNet/pull/26
                    # all predictions are non-matched but also all of them
                    # are ignored and not counted as FP, so
                    # y_true_sorted_cumsum is empty
                    num_true_examples = (
                        y_true_sorted_cumsum[-1]
                        if len(y_true_sorted_cumsum) > 0
                        else 0
                    )
                    precision = np.zeros(num_prec_recall)
                    recall = np.zeros(num_prec_recall)

                    # deal with the first point: the appended 0 is what
                    # index -1 (idx_scores == 0) reads below
                    y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0)
                    # deal with remaining
                    for idx_res, idx_scores in enumerate(unique_indices):
                        cumsum = y_true_sorted_cumsum[idx_scores - 1]
                        tp = num_true_examples - cumsum
                        fp = num_examples - idx_scores - tp
                        fn = cumsum + hard_false_negatives
                        p = float(tp) / (tp + fp)
                        r = float(tp) / (tp + fn)
                        precision[idx_res] = p
                        recall[idx_res] = r

                    # first point in curve is artificial
                    precision[-1] = 1.0
                    recall[-1] = 0.0

                    # compute average of precision-recall curve
                    recall_for_conv = np.copy(recall)
                    recall_for_conv = np.append(
                        recall_for_conv[0], recall_for_conv
                    )
                    recall_for_conv = np.append(recall_for_conv, 0.0)

                    stepWidths = np.convolve(
                        recall_for_conv, [-0.5, 0, 0.5], "valid"
                    )
                    # integrate is now simply a dot product
                    ap_current = np.dot(precision, stepWidths)

                elif has_gt:
                    ap_current = 0.0
                else:
                    ap_current = float("nan")
                ap[di, li, oi] = ap_current
    return ap
def assign_instances_for_scan(pred: dict, gt_file: str):
    """Associate the predicted instances of one scan with its ground truth.

    Args:
        pred: prediction dict as accepted by ``make_pred_info``.
        gt_file: path of the ground-truth id file read by
            ``util_3d.load_ids``.

    Returns:
        ``(gt2pred, pred2gt)``. ``gt2pred`` maps each label name to the
        ground-truth instances, each extended with a ``matched_pred`` list.
        ``pred2gt`` maps each label name to the kept prediction instances,
        each with a ``matched_gt`` list. Matched copies carry the size of
        the ``intersection`` in vertices.

    Raises:
        Exception: whatever ``util_3d.load_ids`` raised, after the error has
            been reported through ``util.print_error``.
    """
    pred_info = make_pred_info(pred)
    try:
        gt_ids = util_3d.load_ids(gt_file)
    except Exception as e:
        util.print_error("unable to load " + gt_file + ": " + str(e))
        # Without the ids nothing below can work; re-raise the real cause
        # instead of failing later with a NameError on ``gt_ids``.
        raise

    # get gt instances
    gt_instances = util_3d.get_instances(
        gt_ids, VALID_CLASS_IDS, CLASS_LABELS, ID_TO_LABEL
    )
    # associate
    gt2pred = deepcopy(gt_instances)
    for label in gt2pred:
        for gt in gt2pred[label]:
            gt["matched_pred"] = []
    pred2gt = {label: [] for label in CLASS_LABELS}
    num_pred_instances = 0
    # mask of void labels in the groundtruth; np.isin replaces the
    # deprecated np.in1d
    # (assumes gt_ids is a 1-D id vector, for which both agree - TODO confirm)
    bool_void = np.logical_not(np.isin(gt_ids // 1000, VALID_CLASS_IDS))
    # go thru all prediction masks
    for uuid, info in pred_info.items():
        label_id = int(info["label_id"])
        conf = info["conf"]
        if label_id not in ID_TO_LABEL:
            continue
        label_name = ID_TO_LABEL[label_id]
        # read the mask
        pred_mask = info["mask"]
        assert len(pred_mask) == len(gt_ids)
        # convert to binary
        pred_mask = np.not_equal(pred_mask, 0)
        num = np.count_nonzero(pred_mask)
        if num < opt["min_region_sizes"][0]:
            continue  # skip masks below the minimum region size

        pred_instance = {}
        pred_instance["uuid"] = uuid
        pred_instance["pred_id"] = num_pred_instances
        pred_instance["label_id"] = label_id
        pred_instance["vert_count"] = num
        pred_instance["confidence"] = conf
        pred_instance["void_intersection"] = np.count_nonzero(
            np.logical_and(bool_void, pred_mask)
        )

        # matched gt instances
        matched_gt = []
        # go thru all gt instances with matching label
        for gt_num, gt_inst in enumerate(gt2pred[label_name]):
            intersection = np.count_nonzero(
                np.logical_and(gt_ids == gt_inst["instance_id"], pred_mask)
            )
            if intersection > 0:
                # shallow copies, so that the cross references do not nest
                gt_copy = gt_inst.copy()
                pred_copy = pred_instance.copy()
                gt_copy["intersection"] = intersection
                pred_copy["intersection"] = intersection
                matched_gt.append(gt_copy)
                gt2pred[label_name][gt_num]["matched_pred"].append(pred_copy)
        pred_instance["matched_gt"] = matched_gt
        num_pred_instances += 1
        pred2gt[label_name].append(pred_instance)

    return gt2pred, pred2gt
VALID_CLASS_IDS + # global ID_TO_LABEL + # global LABEL_TO_ID + + opt["min_region_sizes"] = np.array([10]) + + CLASS_LABELS = [ + "Build", + "LowVeg", + "MediumVeg", + "HighVeg", + "Vehicle", + "Truck", + "Aircraft", + "MilitaryVeh", + "Bike", + "Motorcycle", + "LightPole", + "StreetSign", + "Clutter", + "Fence", + ] + VALID_CLASS_IDS = np.array( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] + ) + + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + if dataset == "s3dis": + # global CLASS_LABELS + # global VALID_CLASS_IDS + # global ID_TO_LABEL + # global LABEL_TO_ID + + CLASS_LABELS = [ + "ceiling", + "floor", + "wall", + "beam", + "column", + "window", + "door", + "table", + "chair", + "sofa", + "bookcase", + "board", + "clutter", + ] + VALID_CLASS_IDS = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + if dataset == "scannet200": + CLASS_LABELS = ( + "chair", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + 
"basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", + "light", + "laundry basket", + "pipe", + "clothes dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "bicycle", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "candle", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "guitar case", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "cd case", + "closet rod", + "coffee kettle", + "structure", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + 
"storage organizer", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "luggage", + "mattress", + ) + + VALID_CLASS_IDS = np.array( + ( + 2, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, + 121, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 221, + 229, + 230, + 232, + 233, + 242, + 250, + 261, + 264, + 276, + 283, + 286, + 300, + 304, + 312, + 323, + 325, + 331, + 342, + 356, + 370, + 392, + 395, + 399, + 408, + 417, + 488, + 540, + 562, + 570, + 572, + 581, + 609, + 748, + 776, + 1156, + 1163, + 1164, + 1165, + 1166, + 1167, + 1168, + 1169, + 1170, + 1171, + 1172, + 1173, + 1174, + 1175, + 1176, + 1178, + 1179, + 1180, + 1181, + 1182, + 1183, + 1184, + 1185, + 1186, + 1187, + 1188, + 1189, + 1190, + 1191, + ) + ) + + ID_TO_LABEL = {} + LABEL_TO_ID = {} + for i in range(len(VALID_CLASS_IDS)): + LABEL_TO_ID[CLASS_LABELS[i]] = VALID_CLASS_IDS[i] + ID_TO_LABEL[VALID_CLASS_IDS[i]] = CLASS_LABELS[i] + + total_true = 0 + total_seen = 0 + NUM_CLASSES = len(VALID_CLASS_IDS) + + true_positive_classes = np.zeros(NUM_CLASSES) + positive_classes = np.zeros(NUM_CLASSES) + gt_classes = np.zeros(NUM_CLASSES) + + # precision & recall + total_gt_ins = np.zeros(NUM_CLASSES) + at = 0.5 + tpsins = [[] for _ 
in range(NUM_CLASSES)] + fpsins = [[] for _ in range(NUM_CLASSES)] + # mucov and mwcov + all_mean_cov = [[] for _ in range(NUM_CLASSES)] + all_mean_weighted_cov = [[] for _ in range(NUM_CLASSES)] + + print("evaluating", len(preds), "scans...") + matches = {} + for i, (k, v) in enumerate(preds.items()): + gt_file = os.path.join(gt_path, k + ".txt") + if not os.path.isfile(gt_file): + util.print_error( + "Scan {} does not match any gt file".format(k), user_fault=True + ) + + if dataset == "s3dis": + gt_ids = util_3d.load_ids(gt_file) + gt_sem = (gt_ids // 1000) - 1 + gt_ins = gt_ids - (gt_ids // 1000) * 1000 + + # pred_sem = v['pred_classes'] - 1 + pred_sem = np.zeros(v["pred_masks"].shape[0], dtype=np.int) + # TODO CONTINUE HERE!!!!!!!!!!!!! + pred_ins = np.zeros(v["pred_masks"].shape[0], dtype=np.int) + + for inst_id in reversed(range(v["pred_masks"].shape[1])): + point_ids = np.argwhere(v["pred_masks"][:, inst_id] == 1.0)[ + :, 0 + ] + pred_ins[point_ids] = inst_id + 1 + pred_sem[point_ids] = v["pred_classes"][inst_id] - 1 + + # semantic acc + total_true += np.sum(pred_sem == gt_sem) + total_seen += pred_sem.shape[0] + + # TODO PARALLELIZ THIS!!!!!!! 
+ # pn semantic mIoU + """ + for j in range(gt_sem.shape[0]): + gt_l = int(gt_sem[j]) + pred_l = int(pred_sem[j]) + gt_classes[gt_l] += 1 + positive_classes[pred_l] += 1 + true_positive_classes[gt_l] += int(gt_l == pred_l) + """ + + uniq, counts = np.unique(pred_sem, return_counts=True) + positive_classes[uniq] += counts + + uniq, counts = np.unique(gt_sem, return_counts=True) + gt_classes[uniq] += counts + + uniq, counts = np.unique( + gt_sem[pred_sem == gt_sem], return_counts=True + ) + true_positive_classes[uniq] += counts + + # instance + un = np.unique(pred_ins) + pts_in_pred = [[] for _ in range(NUM_CLASSES)] + for ig, g in enumerate(un): # each object in prediction + if g == -1: + continue + tmp = pred_ins == g + sem_seg_i = int(stats.mode(pred_sem[tmp])[0]) + pts_in_pred[sem_seg_i] += [tmp] + + un = np.unique(gt_ins) + pts_in_gt = [[] for _ in range(NUM_CLASSES)] + for ig, g in enumerate(un): + tmp = gt_ins == g + sem_seg_i = int(stats.mode(gt_sem[tmp])[0]) + pts_in_gt[sem_seg_i] += [tmp] + + # instance mucov & mwcov + for i_sem in range(NUM_CLASSES): + sum_cov = 0 + mean_cov = 0 + mean_weighted_cov = 0 + num_gt_point = 0 + for ig, ins_gt in enumerate(pts_in_gt[i_sem]): + ovmax = 0.0 + num_ins_gt_point = np.sum(ins_gt) + num_gt_point += num_ins_gt_point + for ip, ins_pred in enumerate(pts_in_pred[i_sem]): + union = ins_pred | ins_gt + intersect = ins_pred & ins_gt + iou = float(np.sum(intersect)) / np.sum(union) + + if iou > ovmax: + ovmax = iou + ipmax = ip + + sum_cov += ovmax + mean_weighted_cov += ovmax * num_ins_gt_point + + if len(pts_in_gt[i_sem]) != 0: + mean_cov = sum_cov / len(pts_in_gt[i_sem]) + all_mean_cov[i_sem].append(mean_cov) + + mean_weighted_cov /= num_gt_point + all_mean_weighted_cov[i_sem].append(mean_weighted_cov) + + if dataset == "s3dis": + # instance precision & recall + for i_sem in range(NUM_CLASSES): + tp = [0.0] * len(pts_in_pred[i_sem]) + fp = [0.0] * len(pts_in_pred[i_sem]) + gtflag = np.zeros(len(pts_in_gt[i_sem])) + 
total_gt_ins[i_sem] += len(pts_in_gt[i_sem]) + + for ip, ins_pred in enumerate(pts_in_pred[i_sem]): + ovmax = -1.0 + + for ig, ins_gt in enumerate(pts_in_gt[i_sem]): + union = ins_pred | ins_gt + intersect = ins_pred & ins_gt + iou = float(np.sum(intersect)) / np.sum(union) + + if iou > ovmax: + ovmax = iou + igmax = ig + + if ovmax >= at: + tp[ip] = 1 # true + else: + fp[ip] = 1 # false positive + + tpsins[i_sem] += tp + fpsins[i_sem] += fp + + matches_key = os.path.abspath(gt_file) + # assign gt to predictions + gt2pred, pred2gt = assign_instances_for_scan(v, gt_file) + matches[matches_key] = {} + matches[matches_key]["gt"] = gt2pred + matches[matches_key]["pred"] = pred2gt + sys.stdout.write("\rscans processed: {}".format(i + 1)) + sys.stdout.flush() + print("") + ap_scores = evaluate_matches(matches) + avgs = compute_averages(ap_scores) + + # print + print_results(avgs) + write_result_file(avgs, output_file) + + if dataset == "s3dis": + MUCov = np.zeros(NUM_CLASSES) + MWCov = np.zeros(NUM_CLASSES) + for i_sem in range(NUM_CLASSES): + MUCov[i_sem] = np.mean(all_mean_cov[i_sem]) + MWCov[i_sem] = np.mean(all_mean_weighted_cov[i_sem]) + + precision = np.zeros(NUM_CLASSES) + recall = np.zeros(NUM_CLASSES) + for i_sem in range(NUM_CLASSES): + tp = np.asarray(tpsins[i_sem]).astype(np.float) + fp = np.asarray(fpsins[i_sem]).astype(np.float) + tp = np.sum(tp) + fp = np.sum(fp) + rec = tp / total_gt_ins[i_sem] + prec = tp / (tp + fp) + + precision[i_sem] = prec + recall[i_sem] = rec + + """ + LOG_FOUT = open(os.path.join('results_a5.txt'), 'w') + + def log_string(out_str): + LOG_FOUT.write(out_str + '\n') + LOG_FOUT.flush() + print(out_str) + """ + + return np.mean(precision), np.mean(recall) + + +# TODO: remove this +# import pandas as pd +# def main(): +# print("!!! CLI is only for debugging purposes. 
use `evaluate()` instead.") +# evaluate(pd.read_pickle("/globalwork/schult/saved_predictions.pkl"), opt.gt_path, opt.output_file) + +# if __name__ == '__main__': +# main() diff --git a/models/Mask3D/mask3d/benchmark/util.py b/models/Mask3D/mask3d/benchmark/util.py new file mode 100644 index 0000000000000000000000000000000000000000..9a4224cd4f785c8a5a7cde490cf0f9999e61dbe7 --- /dev/null +++ b/models/Mask3D/mask3d/benchmark/util.py @@ -0,0 +1,128 @@ +import os, sys +import csv + +try: + import numpy as np +except: + print("Failed to import numpy package.") + sys.exit(-1) +try: + import imageio +except: + print("Please install the module 'imageio' for image processing, e.g.") + print("pip install imageio") + sys.exit(-1) + +# print an error message and quit +def print_error(message, user_fault=False): + sys.stderr.write("ERROR: " + str(message) + "\n") + if user_fault: + sys.exit(2) + sys.exit(-1) + + +# if string s represents an int +def represents_int(s): + try: + int(s) + return True + except ValueError: + return False + + +def read_label_mapping( + filename, label_from="raw_category", label_to="nyu40id" +): + assert os.path.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter="\t") + for row in reader: + mapping[row[label_from]] = int(row[label_to]) + # if ints convert + if represents_int(list(mapping.keys())[0]): + mapping = {int(k): v for k, v in mapping.items()} + return mapping + + +# input: scene_types.txt or scene_types_all.txt +def read_scene_types_mapping(filename, remove_spaces=True): + assert os.path.isfile(filename) + mapping = dict() + lines = open(filename).read().splitlines() + lines = [line.split("\t") for line in lines] + if remove_spaces: + mapping = {x[1].strip(): int(x[0]) for x in lines} + else: + mapping = {x[1]: int(x[0]) for x in lines} + return mapping + + +# color by label +def visualize_label_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image 
= np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + for idx, color in enumerate(color_palette): + vis_image[image == idx] = color + imageio.imwrite(filename, vis_image) + + +# color by different instances (mod length of color palette) +def visualize_instance_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image = np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + instances = np.unique(image) + for idx, inst in enumerate(instances): + vis_image[image == inst] = color_palette[inst % len(color_palette)] + imageio.imwrite(filename, vis_image) + + +# color palette for nyu40 labels +def create_color_palette(): + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn + (100, 85, 144), + ] diff --git a/models/Mask3D/mask3d/benchmark/util_3d.py b/models/Mask3D/mask3d/benchmark/util_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..572064f3ca251563466ca6bfbe2c70dacdad205f --- /dev/null +++ b/models/Mask3D/mask3d/benchmark/util_3d.py @@ -0,0 +1,177 @@ +import os, sys +import json + +try: + 
import numpy as np +except: + print("Failed to import numpy package.") + sys.exit(-1) + +try: + from plyfile import PlyData, PlyElement +except: + print("Please install the module 'plyfile' for PLY i/o, e.g.") + print("pip install plyfile") + sys.exit(-1) + +import benchmark.util as util + + +# matrix: 4x4 np array +# points Nx3 np array +def transform_points(matrix, points): + assert len(points.shape) == 2 and points.shape[1] == 3 + num_points = points.shape[0] + p = np.concatenate([points, np.ones((num_points, 1))], axis=1) + p = np.matmul(matrix, np.transpose(p)) + p = np.transpose(p) + p[:, :3] /= p[:, 3, None] + return p[:, :3] + + +def export_ids(filename, ids): + with open(filename, "w") as f: + for id in ids: + f.write("%d\n" % id) + + +def load_ids(filename): + ids = open(filename).read().splitlines() + ids = np.array(ids, dtype=np.int64) + return ids + + +def read_mesh_vertices(filename): + assert os.path.isfile(filename) + with open(filename, "rb") as f: + plydata = PlyData.read(f) + num_verts = plydata["vertex"].count + vertices = np.zeros(shape=[num_verts, 3], dtype=np.float32) + vertices[:, 0] = plydata["vertex"].data["x"] + vertices[:, 1] = plydata["vertex"].data["y"] + vertices[:, 2] = plydata["vertex"].data["z"] + return vertices + + +# export 3d instance labels for instance evaluation +def export_instance_ids_for_eval(filename, label_ids, instance_ids): + assert label_ids.shape[0] == instance_ids.shape[0] + output_mask_path_relative = "pred_mask" + name = os.path.splitext(os.path.basename(filename))[0] + output_mask_path = os.path.join( + os.path.dirname(filename), output_mask_path_relative + ) + if not os.path.isdir(output_mask_path): + os.mkdir(output_mask_path) + insts = np.unique(instance_ids) + zero_mask = np.zeros(shape=(instance_ids.shape[0]), dtype=np.int32) + with open(filename, "w") as f: + for idx, inst_id in enumerate(insts): + if inst_id == 0: # 0 -> no instance for this vertex + continue + output_mask_file = os.path.join( + 
output_mask_path_relative, name + "_" + str(idx) + ".txt" + ) + loc = np.where(instance_ids == inst_id) + label_id = label_ids[loc[0][0]] + f.write("%s %d %f\n" % (output_mask_file, label_id, 1.0)) + # write mask + mask = np.copy(zero_mask) + mask[loc[0]] = 1 + export_ids(output_mask_file, mask) + + +# ------------ Instance Utils ------------ # + + +class Instance(object): + instance_id = 0 + label_id = 0 + vert_count = 0 + med_dist = -1 + dist_conf = 0.0 + + def __init__(self, mesh_vert_instances, instance_id): + if instance_id == -1: + return + self.instance_id = int(instance_id) + self.label_id = int(self.get_label_id(instance_id)) + self.vert_count = int( + self.get_instance_verts(mesh_vert_instances, instance_id) + ) + + def get_label_id(self, instance_id): + return int(instance_id // 1000) + + def get_instance_verts(self, mesh_vert_instances, instance_id): + return (mesh_vert_instances == instance_id).sum() + + def to_json(self): + return json.dumps( + self, default=lambda o: o.__dict__, sort_keys=True, indent=4 + ) + + def to_dict(self): + dict = {} + dict["instance_id"] = self.instance_id + dict["label_id"] = self.label_id + dict["vert_count"] = self.vert_count + dict["med_dist"] = self.med_dist + dict["dist_conf"] = self.dist_conf + return dict + + def from_json(self, data): + self.instance_id = int(data["instance_id"]) + self.label_id = int(data["label_id"]) + self.vert_count = int(data["vert_count"]) + if "med_dist" in data: + self.med_dist = float(data["med_dist"]) + self.dist_conf = float(data["dist_conf"]) + + def __str__(self): + return "(" + str(self.instance_id) + ")" + + +def read_instance_prediction_file(filename, pred_path): + lines = open(filename).read().splitlines() + instance_info = {} + abs_pred_path = os.path.abspath(pred_path) + for line in lines: + parts = line.split(" ") + if len(parts) != 3: + util.print_error( + "invalid instance prediction file. 
Expected (per line): [rel path prediction] [label id prediction] [confidence prediction]" + ) + if os.path.isabs(parts[0]): + util.print_error( + "invalid instance prediction file. First entry in line must be a relative path" + ) + mask_file = os.path.join(os.path.dirname(filename), parts[0]) + mask_file = os.path.abspath(mask_file) + # check that mask_file lives inside prediction path + if os.path.commonprefix([mask_file, abs_pred_path]) != abs_pred_path: + util.print_error( + "predicted mask {} in prediction text file {} points outside of prediction path.".format( + mask_file, filename + ) + ) + + info = {} + info["label_id"] = int(float(parts[1])) + info["conf"] = float(parts[2]) + instance_info[mask_file] = info + return instance_info + + +def get_instances(ids, class_ids, class_labels, id2label): + instances = {} + for label in class_labels: + instances[label] = [] + instance_ids = np.unique(ids) + for id in instance_ids: + if id == 0: + continue + inst = Instance(ids, id) + if inst.label_id in class_ids: + instances[id2label[inst.label_id]].append(inst.to_dict()) + return instances diff --git a/models/Mask3D/mask3d/conf/__init__.py b/models/Mask3D/mask3d/conf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/Mask3D/mask3d/conf/augmentation/albumentations_aug.yaml b/models/Mask3D/mask3d/conf/augmentation/albumentations_aug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..006663b4be251bf0f41ac2f66f855ae3d59a2878 --- /dev/null +++ b/models/Mask3D/mask3d/conf/augmentation/albumentations_aug.yaml @@ -0,0 +1,30 @@ +__version__: 0.4.5 +transform: + __class_fullname__: albumentations.core.composition.Compose + additional_targets: {} + bbox_params: null + keypoint_params: null + p: 1.0 + transforms: + - __class_fullname__: albumentations.augmentations.transforms.RandomBrightnessContrast + always_apply: true + brightness_by_max: true + brightness_limit: 
+ - -0.2 + - 0.2 + contrast_limit: + - -0.2 + - 0.2 + p: 0.5 + - __class_fullname__: albumentations.augmentations.transforms.RGBShift + always_apply: true + b_shift_limit: + - -20 + - 20 + g_shift_limit: + - -20 + - 20 + p: 0.5 + r_shift_limit: + - -20 + - 20 diff --git a/models/Mask3D/mask3d/conf/augmentation/volumentations_aug.yaml b/models/Mask3D/mask3d/conf/augmentation/volumentations_aug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b86407a2e735ad8dbba79f83746ceb79722aedf --- /dev/null +++ b/models/Mask3D/mask3d/conf/augmentation/volumentations_aug.yaml @@ -0,0 +1,53 @@ +# pi = 3.14159265358979 +# pi/2 = 1.57079632679489 +# pi/3 = 1.04719755119659 +# pi/6 = 0.52359877559829 +# pi/12 = 0.26179938779914 +# pi/24 = 0.13089969389957 +# +__version__: 0.1.6 +transform: + __class_fullname__: volumentations.core.composition.Compose + additional_targets: {} + p: 1.0 + transforms: + - __class_fullname__: volumentations.augmentations.transforms.Scale3d + always_apply: true + p: 0.5 + scale_limit: + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - - -0.1 + - 0.1 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 0 + - 1 + p: 0.5 + rotation_limit: + - -3.141592653589793 + - 3.141592653589793 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 0 + - 1 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 + - __class_fullname__: volumentations.augmentations.transforms.RotateAroundAxis3d + always_apply: true + axis: + - 1 + - 0 + - 0 + p: 0.5 + rotation_limit: + - -0.13089969389957 + - 0.13089969389957 diff --git a/models/Mask3D/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml b/models/Mask3D/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f0958eed35ea4317ddc3f2378dd66336472c0fa --- /dev/null +++ 
b/models/Mask3D/mask3d/conf/callbacks/callbacks_instance_segmentation.yaml @@ -0,0 +1,11 @@ +# @package _group_ +- _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: val_mean_ap_50 + save_last: true + save_top_k: 1 + mode: max + dirpath: ${general.save_dir} + filename: "{epoch}-{val_mean_ap_50:.3f}" + every_n_epochs: 1 + +- _target_: pytorch_lightning.callbacks.LearningRateMonitor diff --git a/models/Mask3D/mask3d/conf/config_base_instance_segmentation.yaml b/models/Mask3D/mask3d/conf/config_base_instance_segmentation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61aeae0519bd308a58293d07ee902beb6a64ed5d --- /dev/null +++ b/models/Mask3D/mask3d/conf/config_base_instance_segmentation.yaml @@ -0,0 +1,75 @@ +general: + train_mode: true + task: "instance_segmentation" + seed: null + checkpoint: null + backbone_checkpoint: null + freeze_backbone: false # train only last layer + linear_probing_backbone: false + train_on_segments: false + eval_on_segments: false + filter_out_instances: false + save_visualizations: false + visualization_point_size: 20 + decoder_id: -1 + export: false + use_dbscan: false + ignore_class_threshold: 100 + project_name: scannet + workspace: jonasschult + experiment_name: DEBUG_ABLATION + num_targets: 19 + add_instance: true + dbscan_eps: 0.95 + dbscan_min_points: 1 + + + export_threshold: 0.0001 + + reps_per_epoch: 1 + + on_crops: false + + scores_threshold: 0.0 + iou_threshold: 1.0 + + area: 5 + + eval_inner_core: -1 # disabled + + topk_per_image: 100 + + ignore_mask_idx: [] + + max_batch_size: 99999999 + + save_dir: saved/${general.experiment_name} + # time/commit/md5(config)_uuid + # time/experiment_id/version_uuid + # experiment_id: 1 # commit[:8], or unique from logger + # version: 1 # md5[:8] of config + + gpus: 1 + +defaults: + - data: indoor + - data/data_loaders: simple_loader + - data/datasets: scannet + - data/collation_functions: voxelize_collate + - logging: full + - model: mask3d + - 
metrics: miou + - optimizer: adamw + - scheduler: onecyclelr + - trainer: trainer600 + - callbacks: callbacks_instance_segmentation + - matcher: hungarian_matcher + - loss: set_criterion + +hydra: + run: + dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: saved/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} + # dir: ${general.save_dir} + subdir: ${hydra.job.num}_${hydra.job.id} diff --git a/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate.yaml b/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate.yaml new file mode 100644 index 0000000000000000000000000000000000000000..026552efb024e4e6fd90bf6bda9df283da2bf4c1 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate.yaml @@ -0,0 +1,42 @@ +# @package data + +train_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.train_mode} + small_crops: false + very_small_crops: false + batch_instance: false + probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.train_dataset.filter_out_classes} + label_offset: ${data.train_dataset.label_offset} + num_queries: ${model.num_queries} + +validation_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.validation_mode} + batch_instance: false + probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.validation_dataset.filter_out_classes} + label_offset: ${data.validation_dataset.label_offset} + num_queries: ${model.num_queries} + +test_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.test_mode} + batch_instance: false + 
probing: ${general.linear_probing_backbone} + task: ${general.task} + ignore_class_threshold: ${general.ignore_class_threshold} + filter_out_classes: ${data.test_dataset.filter_out_classes} + label_offset: ${data.test_dataset.label_offset} + num_queries: ${model.num_queries} \ No newline at end of file diff --git a/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml b/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5d3471d143ddfe999d8f3031e41ba6efce2e879 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/collation_functions/voxelize_collate_merge.yaml @@ -0,0 +1,36 @@ +# @package data + +train_collation: + _target_: mask3d.datasets.utils.VoxelizeCollateMerge + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.train_mode} + small_crops: false + very_small_crops: false + scenes: 2 + batch_instance: false + make_one_pc_noise: false + place_nearby: false + place_far: false + proba: 1 + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} + +validation_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.validation_mode} + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} + +test_collation: + _target_: mask3d.datasets.utils.VoxelizeCollate + ignore_label: ${data.ignore_label} + voxel_size: ${data.voxel_size} + mode: ${data.test_mode} + probing: ${general.linear_probing_backbone} + include_ignore: ${general.include_ignore} + task: ${general.task} diff --git a/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader.yaml b/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39996e14d769c2ba9341da582a1f7bf970fc7925 --- 
/dev/null +++ b/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader.yaml @@ -0,0 +1,22 @@ +# @package data + +train_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: true + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.batch_size} + +validation_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.test_batch_size} + +test_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.test_batch_size} diff --git a/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml b/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1b1b45d13167dc07357a13feb5a513dd71c9a2e --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/data_loaders/simple_loader_save_memory.yaml @@ -0,0 +1,22 @@ +# @package data + +train_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: true + pin_memory: ${data.pin_memory} + num_workers: ${data.num_workers} + batch_size: ${data.batch_size} + +validation_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: 1 + batch_size: ${data.test_batch_size} + +test_dataloader: + _target_: torch.utils.data.DataLoader + shuffle: false + pin_memory: ${data.pin_memory} + num_workers: 1 + batch_size: ${data.test_batch_size} diff --git a/models/Mask3D/mask3d/conf/data/datasets/matterport.yaml b/models/Mask3D/mask3d/conf/data/datasets/matterport.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6728ab9eb26bc78f435237d9d7d61800b900735d --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/matterport.yaml @@ -0,0 +1,48 @@ +# @package data +train_dataset: + _target_: 
mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/matterport + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/matterport + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/matterport/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/mask3d/conf/data/datasets/matterport_scannet.yaml b/models/Mask3D/mask3d/conf/data/datasets/matterport_scannet.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..df259ceaadfa68a90c2b8a60d7b74a958b30c79d --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/matterport_scannet.yaml @@ -0,0 +1,50 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: + - data/processed/scannet + - data/processed/matterport + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + 
add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/mask3d/conf/data/datasets/rio.yaml b/models/Mask3D/mask3d/conf/data/datasets/rio.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1adfea36fea05b14a7fa95382677aee6144d1b4b --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/rio.yaml @@ -0,0 +1,48 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: mix3d/conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +validation_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + +test_dataset: + _target_: mix3d.datasets.semseg.SemanticSegmentationDataset + data_dir: data/processed/rio + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + 
ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} diff --git a/models/Mask3D/mask3d/conf/data/datasets/s3dis.yaml b/models/Mask3D/mask3d/conf/data/datasets/s3dis.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e1385416655514397d82737e1edc2d1a5997657 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/s3dis.yaml @@ -0,0 +1,87 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/s3dis/label_database.yaml + color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: False + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/s3dis/label_database.yaml + 
color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "s3dis" + data_dir: data/processed/s3dis + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/s3dis/label_database.yaml + color_mean_std: data/processed/s3dis/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + filter_out_classes: [] + label_offset: 0 diff --git a/models/Mask3D/mask3d/conf/data/datasets/scannet.yaml b/models/Mask3D/mask3d/conf/data/datasets/scannet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50f1c6c5998d8f3c6dae35ef508225dff4b0271f --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/scannet.yaml @@ -0,0 +1,79 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: 
conf/augmentation/volumentations_aug.yaml + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: false + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + add_unlabeled_pc: false + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet" + data_dir: data/processed/scannet + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/scannet/label_database.yaml + color_mean_std: data/processed/scannet/color_mean_std.yaml + data_percent: 1.0 + mode: 
${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 1] + label_offset: 2 diff --git a/models/Mask3D/mask3d/conf/data/datasets/scannet200.yaml b/models/Mask3D/mask3d/conf/data/datasets/scannet200.yaml new file mode 100644 index 0000000000000000000000000000000000000000..730a6ab9f1965004ec9828d1e8b2429005bef6f2 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/scannet200.yaml @@ -0,0 +1,79 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: false + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + add_unlabeled_pc: false + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: 
"scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: null + volume_augmentations_path: null + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 + +test_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "scannet200" + data_dir: /home/weders/scratch/scratch/scannetter/arkit/raw/ + image_augmentations_path: null + volume_augmentations_path: null + # label_db_filepath: data/processed/scannet200/label_database.yaml + # color_mean_std: data/processed/scannet200/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + filter_out_classes: [0, 2] + label_offset: 2 diff --git a/models/Mask3D/mask3d/conf/data/datasets/semantic_kitti.yaml b/models/Mask3D/mask3d/conf/data/datasets/semantic_kitti.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9540ad610bd4a68d64369519d20e13009df9feda --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/semantic_kitti.yaml @@ -0,0 +1,42 @@ +# @package data +train_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + 
label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.train_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: mix3d/conf/augmentation/volumentations_aug.yaml + +validation_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.validation_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: null + +test_dataset: + _target_: mix3d.datasets.outdoor_semseg.LidarDataset + data_dir: data/processed/semantic_kitti + label_db_filepath: data/processed/semantic_kitti/label_database.yaml + mode: ${data.test_mode} + add_reflection: ${data.add_reflection} + add_distance: ${data.add_distance} + add_instance: ${data.add_instance} + num_labels: ${data.num_labels} + sweep: ${data.sweep} + data_percent: 1.0 + ignore_label: ${data.ignore_label} + volume_augmentations_path: null diff --git a/models/Mask3D/mask3d/conf/data/datasets/stpls3d.yaml b/models/Mask3D/mask3d/conf/data/datasets/stpls3d.yaml new file mode 100644 index 0000000000000000000000000000000000000000..913667d4123a7edead9d948358ae25cf9f7b4bb1 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/datasets/stpls3d.yaml @@ -0,0 +1,95 @@ +# @package data +train_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: conf/augmentation/albumentations_aug.yaml + volume_augmentations_path: conf/augmentation/volumentations_aug.yaml + label_db_filepath: 
data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.train_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + # different augs experiments + instance_oversampling: 0.0 + place_around_existing: False + point_per_cut: 0 + max_cut_region: 0 + flip_in_center: false + noise_rate: 0 + resample_points: 0 + cropping: ${data.cropping} + cropping_args: ${data.cropping_args} + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + reps_per_epoch: ${general.reps_per_epoch} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 + is_elastic_distortion: true + color_drop: 0.0 + +validation_dataset: + _target_: mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.validation_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + on_crops: ${general.on_crops} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 + +test_dataset: + _target_: 
mask3d.datasets.semseg.SemanticSegmentationDataset + dataset_name: "stpls3d" + data_dir: data/processed/stpls3d + image_augmentations_path: null + volume_augmentations_path: null + label_db_filepath: data/processed/stpls3d/label_database.yaml + color_mean_std: data/processed/stpls3d/color_mean_std.yaml + data_percent: 1.0 + mode: ${data.test_mode} + ignore_label: ${data.ignore_label} + num_labels: ${data.num_labels} + add_raw_coordinates: ${data.add_raw_coordinates} + add_colors: ${data.add_colors} + add_normals: ${data.add_normals} + add_instance: ${data.add_instance} + cache_data: ${data.cache_data} + cropping: false + is_tta: false + crop_min_size: ${data.crop_min_size} + crop_length: ${data.crop_length} + cropping_v1: ${data.cropping_v1} + area: ${general.area} + on_crops: ${general.on_crops} + eval_inner_core: ${general.eval_inner_core} + filter_out_classes: [0] + label_offset: 1 diff --git a/models/Mask3D/mask3d/conf/data/indoor.yaml b/models/Mask3D/mask3d/conf/data/indoor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..868c37ccfe901f14396b68a38eac47b42cb3e812 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/indoor.yaml @@ -0,0 +1,43 @@ +# @package _group_ + +# these parameters are inherited by datasets, data_loaders and collators +# but they might be overwritten + +# splits +train_mode: train +validation_mode: validation +test_mode: validation # test # validation + +# dataset +ignore_label: 255 +add_raw_coordinates: true # 3dim +add_colors: true # 3dim +add_normals: false # 3dim +in_channels: 3 # in_channels = 3 * (add_normals + add_colors + add_raw_coordinates) +num_labels: 20 +# num_labels: 41 +add_instance: ${general.add_instance} +task: ${general.task} + +# data loader +pin_memory: false +num_workers: 4 +batch_size: 5 +test_batch_size: 1 +cache_data: false + +# collation +voxel_size: 0.02 + +reps_per_epoch: ${general.reps_per_epoch} + +cropping: false +cropping_args: + min_points: 30000 + aspect: 0.8 + min_crop: 0.5 + 
max_crop: 1.0 + +crop_min_size: 20000 +crop_length: 6.0 +cropping_v1: true \ No newline at end of file diff --git a/models/Mask3D/mask3d/conf/data/outdoor.yaml b/models/Mask3D/mask3d/conf/data/outdoor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a77474f62d1cfb53f130160f641c65cb81a62956 --- /dev/null +++ b/models/Mask3D/mask3d/conf/data/outdoor.yaml @@ -0,0 +1,26 @@ +# @package _group_ + +# these parameters are inherited by datasets, data_loaders and collators +# but they might be overwritten + +# splits +train_mode: train +validation_mode: validation +test_mode: validation + +# dataset +ignore_label: 255 +add_distance: true # 1dim +add_reflection: true # 1dim +in_channels: 2 # in_channels = add_distance + add_reflection +num_labels: 19 +add_instance: false + +# data loader +pin_memory: true +num_workers: 4 +batch_size: 18 +sweep: 1 + +# collation +voxel_size: 0.15 diff --git a/models/Mask3D/mask3d/conf/logging/base.yaml b/models/Mask3D/mask3d/conf/logging/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d700a101ddf3d1e2c1a3cdea08190afff762a5b --- /dev/null +++ b/models/Mask3D/mask3d/conf/logging/base.yaml @@ -0,0 +1,10 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.NeptuneLogger + project_name: ${general.workspace}/${general.project_name} + experiment_name: ${general.experiment_name} + offline_mode: false + +- _target_: pytorch_lightning.loggers.CSVLogger + save_dir: ${general.save_dir} + name: ${general.experiment_id} + version: ${general.version} diff --git a/models/Mask3D/mask3d/conf/logging/full.yaml b/models/Mask3D/mask3d/conf/logging/full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b434e94dc1f0889cf0829b5f89b8509717a3546c --- /dev/null +++ b/models/Mask3D/mask3d/conf/logging/full.yaml @@ -0,0 +1,8 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.WandbLogger + project: ${general.project_name} + name: ${general.experiment_name} + save_dir: 
${general.save_dir} + entity: "schult" + resume: "allow" + id: ${general.experiment_name} diff --git a/models/Mask3D/mask3d/conf/logging/minimal.yaml b/models/Mask3D/mask3d/conf/logging/minimal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1c46e26fefedcec50d4fdc9fc77c187d60cf7b9 --- /dev/null +++ b/models/Mask3D/mask3d/conf/logging/minimal.yaml @@ -0,0 +1,5 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.CSVLogger + save_dir: ${general.save_dir} + name: ${general.experiment_id} + version: ${general.version} diff --git a/models/Mask3D/mask3d/conf/logging/offline.yaml b/models/Mask3D/mask3d/conf/logging/offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..914ad19142ca22c3778be709208323908460ebac --- /dev/null +++ b/models/Mask3D/mask3d/conf/logging/offline.yaml @@ -0,0 +1,10 @@ +# @package _group_ +- _target_: pytorch_lightning.loggers.TensorBoardLogger + name: ${general.experiment_id} + version: ${general.version} + save_dir: ${general.save_dir} + +- _target_: pytorch_lightning.loggers.CSVLogger + name: ${general.experiment_id} + version: ${general.version} + save_dir: ${general.save_dir} \ No newline at end of file diff --git a/models/Mask3D/mask3d/conf/loss/cross_entropy.yaml b/models/Mask3D/mask3d/conf/loss/cross_entropy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c000f40ad2ab40605c244e38243a6e0cc7933768 --- /dev/null +++ b/models/Mask3D/mask3d/conf/loss/cross_entropy.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.nn.CrossEntropyLoss +ignore_index: ${data.ignore_label} diff --git a/models/Mask3D/mask3d/conf/loss/set_criterion.yaml b/models/Mask3D/mask3d/conf/loss/set_criterion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c04ba49ce1823c2d6e923a03ae0514490d463e9 --- /dev/null +++ b/models/Mask3D/mask3d/conf/loss/set_criterion.yaml @@ -0,0 +1,11 @@ +# @package _group_ +_target_: mask3d.models.criterion.SetCriterion 
+num_classes: ${general.num_targets} +eos_coef: 0.1 +losses: + - "labels" + - "masks" +num_points: ${matcher.num_points} +oversample_ratio: 3.0 +importance_sample_ratio: 0.75 +class_weights: -1 diff --git a/models/Mask3D/mask3d/conf/loss/set_criterion_custom_weights_1.yaml b/models/Mask3D/mask3d/conf/loss/set_criterion_custom_weights_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d2c308e081c1ffa61beb13308b27e6ff753f0f4 --- /dev/null +++ b/models/Mask3D/mask3d/conf/loss/set_criterion_custom_weights_1.yaml @@ -0,0 +1,11 @@ +# @package _group_ +_target_: mask3d.models.criterion.SetCriterion +num_classes: ${general.num_targets} +eos_coef: 0.1 +losses: + - "labels" + - "masks" +num_points: ${matcher.num_points} +oversample_ratio: 3.0 +importance_sample_ratio: 0.75 +class_weights: [1.0,1.5,10.0,1.0,1.0,1.0,1.0,1.0,10.0,10.0,1.0,10.0,1.0,1.0] diff --git a/models/Mask3D/mask3d/conf/matcher/hungarian_matcher.yaml b/models/Mask3D/mask3d/conf/matcher/hungarian_matcher.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47750b20906b6b40a131b702ba360e36ee4c8380 --- /dev/null +++ b/models/Mask3D/mask3d/conf/matcher/hungarian_matcher.yaml @@ -0,0 +1,6 @@ +# @package _group_ +_target_: mask3d.models.matcher.HungarianMatcher +cost_class: 2. +cost_mask: 5. +cost_dice: 2. 
+num_points: -1 diff --git a/models/Mask3D/mask3d/conf/metrics/miou.yaml b/models/Mask3D/mask3d/conf/metrics/miou.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68d1b61181d9615d7d6d7638261d119a4fc47074 --- /dev/null +++ b/models/Mask3D/mask3d/conf/metrics/miou.yaml @@ -0,0 +1,4 @@ +# @package _group_ +_target_: mask3d.models.metrics.ConfusionMatrix +num_classes: ${data.num_labels} +ignore_label: ${data.ignore_label} diff --git a/models/Mask3D/mask3d/conf/model/mask3d.yaml b/models/Mask3D/mask3d/conf/model/mask3d.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95718d8710477650561e0ddd845688f50c868032 --- /dev/null +++ b/models/Mask3D/mask3d/conf/model/mask3d.yaml @@ -0,0 +1,47 @@ +# @package _group_ +_target_: mask3d.models.Mask3D + +# transformer parameters +hidden_dim: 128 +dim_feedforward: 1024 +num_queries: 100 +num_heads: 8 +num_decoders: 3 +dropout: 0.0 +pre_norm: false +use_level_embed: false +normalize_pos_enc: true +positional_encoding_type: "fourier" +gauss_scale: 1.0 +hlevels: [0,1,2,3] + +# queries +non_parametric_queries: true +random_query_both: false +random_normal: false +random_queries: false +use_np_features: false + +# sampling +sample_sizes: [200, 800, 3200, 12800, 51200] +max_sample_size: false # change false means sampling activated + +shared_decoder: true +num_classes: ${general.num_targets} +train_on_segments: ${general.train_on_segments} +scatter_type: "mean" + +voxel_size: ${data.voxel_size} + +config: + backbone: + _target_: mask3d.models.Res16UNet34C + config: + dialations: [ 1, 1, 1, 1 ] + conv1_kernel_size: 5 + bn_momentum: 0.02 + # depends on normals, color, raw_coordinates + # varies from 3 to 9 + in_channels: ${data.in_channels} + out_channels: ${data.num_labels} + out_fpn: true diff --git a/models/Mask3D/mask3d/conf/optimizer/adamw.yaml b/models/Mask3D/mask3d/conf/optimizer/adamw.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..4b4020d1ddd1444c94ea5bfbe1281c485fca587e --- /dev/null +++ b/models/Mask3D/mask3d/conf/optimizer/adamw.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.optim.AdamW +lr: 0.0001 \ No newline at end of file diff --git a/models/Mask3D/mask3d/conf/optimizer/adamw_lower.yaml b/models/Mask3D/mask3d/conf/optimizer/adamw_lower.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e42f091a0d5dd03b66ab1dcec8b81d78a692af9 --- /dev/null +++ b/models/Mask3D/mask3d/conf/optimizer/adamw_lower.yaml @@ -0,0 +1,3 @@ +# @package _group_ +_target_: torch.optim.AdamW +lr: 0.005 diff --git a/models/Mask3D/mask3d/conf/scheduler/exponentiallr.yaml b/models/Mask3D/mask3d/conf/scheduler/exponentiallr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc5224083670b286d75fda46304560dbcca3aecb --- /dev/null +++ b/models/Mask3D/mask3d/conf/scheduler/exponentiallr.yaml @@ -0,0 +1,11 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.ExponentialLR + gamma: 0.99999 + last_epoch: -1 # ${trainer.max_epochs} + # need to set to number because of tensorboard logger + # steps_per_epoch: -1 + +pytorch_lightning_params: + interval: step diff --git a/models/Mask3D/mask3d/conf/scheduler/lambdalr.yaml b/models/Mask3D/mask3d/conf/scheduler/lambdalr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b63f6f4333e98931ce22f1a38829de0ef51a3719 --- /dev/null +++ b/models/Mask3D/mask3d/conf/scheduler/lambdalr.yaml @@ -0,0 +1,8 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.StepLR + step_size: 99999 + +pytorch_lightning_params: + interval: epoch diff --git a/models/Mask3D/mask3d/conf/scheduler/onecyclelr.yaml b/models/Mask3D/mask3d/conf/scheduler/onecyclelr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c788877193d7366c21088cf9fefb77e4f62ef4d9 --- /dev/null +++ b/models/Mask3D/mask3d/conf/scheduler/onecyclelr.yaml @@ -0,0 
+1,11 @@ +# @package _group_ + +scheduler: + _target_: torch.optim.lr_scheduler.OneCycleLR + max_lr: ${optimizer.lr} + epochs: ${trainer.max_epochs} + # need to set to number because of tensorboard logger + steps_per_epoch: -1 + +pytorch_lightning_params: + interval: step diff --git a/models/Mask3D/mask3d/conf/trainer/trainer.yaml b/models/Mask3D/mask3d/conf/trainer/trainer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f436300f9ca6bbbe96ca6c1b4c7e8eeffe35fabd --- /dev/null +++ b/models/Mask3D/mask3d/conf/trainer/trainer.yaml @@ -0,0 +1,7 @@ +# @package _group_ +deterministic: false +max_epochs: 1000 +min_epochs: 1 +resume_from_checkpoint: null +check_val_every_n_epoch: 50 +num_sanity_val_steps: -1 diff --git a/models/Mask3D/mask3d/conf/trainer/trainer600.yaml b/models/Mask3D/mask3d/conf/trainer/trainer600.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc9f00295aafe3431d1c0e7ca50dbc29559ea134 --- /dev/null +++ b/models/Mask3D/mask3d/conf/trainer/trainer600.yaml @@ -0,0 +1,7 @@ +# @package _group_ +deterministic: false +max_epochs: 601 +min_epochs: 1 +resume_from_checkpoint: null +check_val_every_n_epoch: 50 +num_sanity_val_steps: 2 diff --git a/models/Mask3D/mask3d/datasets/__init__.py b/models/Mask3D/mask3d/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/Mask3D/mask3d/datasets/outdoor_semseg.py b/models/Mask3D/mask3d/datasets/outdoor_semseg.py new file mode 100644 index 0000000000000000000000000000000000000000..4592a6eda45c1a7626530eb19c42c267496749df --- /dev/null +++ b/models/Mask3D/mask3d/datasets/outdoor_semseg.py @@ -0,0 +1,206 @@ +import logging +from pathlib import Path +from typing import List, Optional, Union, Tuple +from random import random + +import numpy as np +import volumentations as V +import yaml +from torch.utils.data import Dataset + +logger = logging.getLogger(__name__) + + +class 
class LidarDataset(Dataset):
    """Dataset of LiDAR sweeps described by per-split YAML database files.

    Every entry of ``<data_dir>/<mode>_database.yaml`` is read as a mapping
    with at least the keys ``scene``, ``filepath`` and ``pose``. Consecutive
    entries that share the same ``scene`` are grouped and cut into chunks of
    ``sweep`` entries; one dataset item is one such chunk, merged into a
    single point cloud.

    Args:
        data_dir: One directory, or an iterable of directories, each holding
            a ``<mode>_database.yaml`` file.
        label_db_filepath: YAML file mapping label ids to label records; each
            record must expose a boolean ``validation`` field.
        mode: Split name; selects the database file, and augmentations are
            applied only when the string contains ``"train"``.
        add_reflection: When false, the feature columns read from disk are
            replaced by a single column of ones.
        add_distance: When true, the distance of every point to the mean
            coordinate is appended as an extra feature column.
        add_instance: When false, only the first label column (segmentation
            label) is returned.
        num_labels: Must equal either the total number of labels or the
            number of labels flagged ``validation``.
        data_percent: Fraction of the (already chunked) items to keep,
            counted from the start of the list.
        ignore_label: Value written for labels missing from the label
            database.
        volume_augmentations_path: Optional YAML file loaded with
            ``volumentations.load``; ``V.NoOp()`` is used when it is ``None``.
        sweep: Number of consecutive entries of one scene merged per item.

    Raises:
        FileNotFoundError: If a ``<mode>_database.yaml`` file is missing.
        ValueError: If ``num_labels`` matches neither label count.
    """

    def __init__(
        self,
        data_dir: Optional[
            Union[str, Tuple[str]]
        ] = "data/processed/semantic_kitti",
        label_db_filepath: Optional[
            str
        ] = "./data/processed/semantic_kitti/label_database.yaml",
        mode: Optional[str] = "train",
        add_reflection: Optional[bool] = True,
        add_distance: Optional[bool] = False,
        add_instance: Optional[bool] = True,
        num_labels: Optional[int] = -1,
        data_percent: Optional[float] = 1.0,
        ignore_label: Optional[Union[int, List[int]]] = 255,
        volume_augmentations_path: Optional[str] = None,
        sweep: Optional[int] = 1,
    ):
        self.mode = mode
        # A single directory is normalised to a one-element list so the
        # loading loop below can treat both input forms uniformly.
        self.data_dir = [data_dir] if isinstance(data_dir, str) else data_dir
        self.ignore_label = ignore_label
        self.add_instance = add_instance
        self.add_distance = add_distance
        self.add_reflection = add_reflection

        # loading database files
        self._data = []
        for database_path in self.data_dir:
            database_path = Path(database_path)
            database_file = database_path / f"{mode}_database.yaml"
            if not database_file.exists():
                # Raise instead of print + exit(): a bare exit() ends the
                # process with status 0 and hides the failure from callers.
                raise FileNotFoundError(
                    f"generate {database_path}/{mode}_database.yaml first"
                )
            self._data.extend(self._load_yaml(database_file))

        labels = self._load_yaml(Path(label_db_filepath))
        self._labels = self._select_correct_labels(labels, num_labels)

        # augmentations
        self.volume_augmentations = V.NoOp()
        if volume_augmentations_path is not None:
            self.volume_augmentations = V.load(
                volume_augmentations_path, data_format="yaml"
            )

        # reformulating in sweeps
        self._data = self._group_sweeps(self._data, sweep)

        if data_percent < 1.0:
            self._data = self._data[: int(len(self._data) * data_percent)]

    def _group_sweeps(self, entries, sweep):
        """Group consecutive same-scene entries and cut them into chunks.

        Returns a flat list of chunks, each holding at most ``sweep``
        entries of one scene. An empty ``entries`` list yields ``[]``.
        """
        scenes = []
        for entry in entries:
            if scenes and scenes[-1][0]["scene"] == entry["scene"]:
                scenes[-1].append(entry)
            else:
                scenes.append([entry])
        return [
            chunk for scene in scenes for chunk in self.chunks(scene, sweep)
        ]

    @staticmethod
    def chunks(lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i : i + n]

    def __len__(self):
        """Return the number of sweep chunks in the dataset."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Load, pose-transform and merge all sweeps of chunk ``idx``.

        Returns:
            A ``(coordinates, features, labels)`` tuple. ``coordinates`` are
            the first three columns of the stored arrays, ``labels`` the last
            two columns (reduced to the first one when ``add_instance`` is
            false) and ``features`` everything in between.
        """
        clouds = []
        for sweep in self.data[idx]:
            cloud = np.load(sweep["filepath"])
            pose = np.array(sweep["pose"])
            # rotate
            # NOTE(review): this right-multiplies row vectors by the 3x3
            # block of the pose - confirm it matches the pose convention.
            cloud[:, :3] = cloud[:, :3] @ pose[:3, :3]
            # translate
            cloud[:, :3] += pose[:3, 3]
            clouds.append(cloud)
        points = np.vstack(clouds)

        coordinates, features, labels = (
            points[:, :3],
            points[:, 3:-2],
            points[:, -2:],
        )

        if not self.add_reflection:
            # Constant placeholder feature. The previous code passed an array
            # as the shape argument of np.ones, which always raised.
            features = np.ones((len(coordinates), 1))

        if self.add_distance:
            center_coordinate = coordinates.mean(0)
            distances = np.linalg.norm(
                coordinates - center_coordinate, axis=1
            )
            features = np.hstack((features, distances[:, np.newaxis]))

        # volume and image augmentations for train
        if "train" in self.mode:
            coordinates -= coordinates.mean(0)
            if 0.5 > random():
                coordinates += (
                    np.random.uniform(coordinates.min(0), coordinates.max(0))
                    / 2
                )
            aug = self.volume_augmentations(
                points=coordinates,
                features=features,
                labels=labels,
            )
            coordinates, features, labels = (
                aug["points"],
                aug["features"],
                aug["labels"],
            )

        # prepare labels and map from 0 to 20(40)
        labels = labels.astype(np.int32)
        if labels.size > 0:
            labels[:, 0] = self._remap_from_zero(labels[:, 0])
            if not self.add_instance:
                # taking only first column, which is segmentation label, not instance
                labels = labels[:, 0].flatten()

        return coordinates, features, labels

    @property
    def data(self):
        """database file containing information about preprocessed dataset"""
        return self._data

    @property
    def label_info(self):
        """database file containing information labels used by dataset"""
        return self._labels

    @staticmethod
    def _load_yaml(filepath):
        """Parse the YAML file at ``filepath`` with ``yaml.safe_load``."""
        with open(filepath) as f:
            file = yaml.safe_load(f)
        return file

    def _select_correct_labels(self, labels, num_labels):
        """Return the label subset whose size equals ``num_labels``.

        ``labels`` itself is returned when ``num_labels`` equals its size;
        the entries flagged ``validation`` are returned when ``num_labels``
        equals their count.

        Raises:
            ValueError: If ``num_labels`` matches neither count.
        """
        valid_labels = {k: v for k, v in labels.items() if v["validation"]}
        number_of_all_labels = len(labels)
        number_of_validation_labels = len(valid_labels)

        if num_labels == number_of_all_labels:
            return labels
        if num_labels == number_of_validation_labels:
            return valid_labels
        msg = f"""not available number labels, select from:
            {number_of_validation_labels}, {number_of_all_labels}"""
        raise ValueError(msg)

    def _remap_from_zero(self, labels):
        """Map label ids to consecutive indices, in place.

        Ids missing from ``label_info`` become ``ignore_label``; every known
        id becomes its position in ``label_info``. ``labels`` is modified in
        place and also returned.
        """
        keys = list(self.label_info.keys())
        # Compare against a snapshot so that an index written for one key can
        # never be mistaken for a later key and remapped a second time.
        source = labels.copy()
        labels[~np.isin(source, keys)] = self.ignore_label
        # remap to the range from 0
        for i, k in enumerate(keys):
            labels[source == k] = i
        return labels

    def _remap_model_output(self, output):
        """Map consecutive model indices back to the original label ids."""
        output = np.array(output)
        output_remapped = output.copy()
        for i, k in enumerate(self.label_info.keys()):
            output_remapped[output == i] = k
        return output_remapped
def __init__(
    self,
    data_dir: str = "/home/weders/scratch/scratch/scannetter/arkit/raw",
    save_dir: str = "/home/weders/scratch/scratch/scannetter/arkit/raw",
    modes: tuple = ('Validation', ),
    n_jobs: int = 1,
    git_repo: str = "./data/raw/scannet/ScanNet",
    mesh_file: str="mesh_tsdf.ply",
    scannet200: bool = False,
):
    """Collect the mesh file of every scene folder, per mode.

    Args:
        data_dir: root folder; each mode is a sub-folder holding one
            folder per scene.
        save_dir: output folder, forwarded to the base class.
        modes: names of the mode sub-folders to scan.
        n_jobs: forwarded to the base class.
        git_repo: accepted for signature compatibility; not used here.
        mesh_file: file name looked up inside every scene folder.
        scannet200: stored on the instance.
    """
    super().__init__(data_dir, save_dir, modes, n_jobs)

    self.scannet200 = scannet200
    for mode in self.modes:
        # Use the Path prepared by the base class for both the listing and
        # the stored paths instead of mixing it with the raw string.
        mode_dir = self.data_dir / mode
        candidates = [
            mode_dir / scene / mesh_file for scene in os.listdir(mode_dir)
        ]
        # Scenes without the requested mesh are skipped.
        self.files[mode] = natsorted(
            [mesh for mesh in candidates if mesh.exists()]
        )
+ + Args: + filepath: path to the main ply file + mode: train, test or validation + + Returns: + filebase: info about file + """ + scene = int(filepath.parent.name) + print(scene) + filebase = { + "filepath": filepath, + "scene": scene, + "sub_scene": scene, + "raw_filepath": str(filepath), + "file_len": -1, + } + # reading both files and checking that they are fitting + coords, features, _ = load_ply_with_normals(filepath) + file_len = len(coords) + filebase["file_len"] = file_len + points = np.hstack((coords, features)) + + print(features.shape) + + points = np.concatenate((points, np.zeros((file_len, 4))), axis=1) # adding segment and label fake columns + + processed_filepath = ( + self.save_dir / mode / f"data_mask3d.npy" + ) + if not processed_filepath.parent.exists(): + processed_filepath.parent.mkdir(parents=True, exist_ok=True) + np.save(processed_filepath, points.astype(np.float32)) + filebase["filepath"] = str(processed_filepath) + + return filebase + + @logger.catch + def fix_bugs_in_labels(self): + if not self.scannet200: + logger.add(self.save_dir / "fixed_bugs_in_labels.log") + found_wrong_labels = { + tuple([270, 0]): 50, + tuple([270, 2]): 50, + tuple([384, 0]): 149, + } + for scene, wrong_label in found_wrong_labels.items(): + scene, sub_scene = scene + bug_file = ( + self.save_dir / "train" / f"{scene:04}_{sub_scene:02}.npy" + ) + points = np.load(bug_file) + bug_mask = points[:, -1] != wrong_label + points = points[bug_mask] + np.save(bug_file, points) + logger.info(f"Fixed {bug_file}") + + def _parse_scene_subscene(self, name): + scene_match = re.match(r"scene(\d{4})_(\d{2})", name) + print(scene_match) + return int(scene_match.group(1)), int(scene_match.group(2)) + + +if __name__ == "__main__": + Fire(ARKitScenesPreprocessing) \ No newline at end of file diff --git a/models/Mask3D/mask3d/datasets/preprocessing/base_preprocessing.py b/models/Mask3D/mask3d/datasets/preprocessing/base_preprocessing.py new file mode 100644 index 
0000000000000000000000000000000000000000..a17fd4f89aca0d16d27b1bd10c9f40b3e40a6e61 --- /dev/null +++ b/models/Mask3D/mask3d/datasets/preprocessing/base_preprocessing.py @@ -0,0 +1,204 @@ +import os +import sys +import re +import yaml +import json +import multiprocessing +from pathlib import Path +from hashlib import md5 + +import numpy as np +from fire import Fire +from tqdm import tqdm +from joblib import Parallel, delayed +from loguru import logger + + +class BasePreprocessing: + def __init__( + self, + data_dir: str = "./data/raw/", + save_dir: str = "./data/processed/", + modes: tuple = ("train", "validation", "test"), + n_jobs: int = -1, + ): + self.data_dir = Path(data_dir) + self.save_dir = Path(save_dir) + self.n_jobs = n_jobs + self.modes = modes + + if not self.data_dir.exists(): + logger.error("data folder doesn't exist") + raise FileNotFoundError + if self.save_dir.exists() is False: + self.save_dir.mkdir(parents=True, exist_ok=True) + + self.files = {} + for data_type in self.modes: + self.files.update({data_type: []}) + + @logger.catch + def preprocess(self): + self.n_jobs = ( + multiprocessing.cpu_count() if self.n_jobs == -1 else self.n_jobs + ) + for mode in self.modes: + database = [] + logger.info(f"Tasks for {mode}: {len(self.files[mode])}") + parallel_results = Parallel(n_jobs=self.n_jobs, verbose=10)( + delayed(self.process_file)(file, mode) + for file in self.files[mode] + ) + for filebase in parallel_results: + database.append(filebase) + self.save_database(database, mode) + # self.fix_bugs_in_labels() + # self.joint_database() + # self.compute_color_mean_std( + # train_database_path=(self.save_dir / "train_database.yaml") + # ) + + def preprocess_sequential(self): + for mode in self.modes: + database = [] + for filepath in tqdm(self.files[mode], unit="file"): + filebase = self.process_file(filepath, mode) + database.append(filebase) + self.save_database(database, mode) + self.fix_bugs_in_labels() + self.joint_database() + 
def make_instance_database_sequential(
    self,
    train_database_path: str = "./data/processed/train_database.yaml",
    mode="instance",
):
    """Extract the instances of every training sample, one after another.

    Args:
        train_database_path: YAML database listing the samples.
        mode: prefix of the database file the result is saved under.
    """
    samples = self._load_yaml(train_database_path)
    # tqdm only reports progress; samples are handled in database order.
    extracted = [
        self.extract_instance_from_file(entry) for entry in tqdm(samples)
    ]
    self.save_database(extracted, mode=mode)
"classes": instance_classes, + "instance_filepath": str(instance_filepath), + "instance_size": len(instance_points), + "original_file": str(sample_from_database["filepath"]), + } + if not instance_filepath.parent.exists(): + instance_filepath.parent.mkdir(parents=True, exist_ok=True) + np.save(instance_filepath, instance_points.astype(np.float32)) + file_instances.append(instance) + return file_instances + + def fix_bugs_in_labels(self): + pass + + def compute_color_mean_std( + self, + train_database_path: str = "./data/processed/train_database.yaml", + ): + pass + + def save_database(self, database, mode): + for element in database: + self._dict_to_yaml(element) + self._save_yaml(self.save_dir / (mode + "_database.yaml"), database) + + def joint_database(self, train_modes=["train", "validation"]): + joint_db = [] + for mode in train_modes: + joint_db.extend( + self._load_yaml(self.save_dir / (mode + "_database.yaml")) + ) + self._save_yaml( + self.save_dir / "train_validation_database.yaml", joint_db + ) + + @classmethod + def _read_json(cls, path): + with open(path) as f: + file = json.load(f) + return file + + @classmethod + def _save_yaml(cls, path, file): + with open(path, "w") as f: + yaml.safe_dump( + file, f, default_style=None, default_flow_style=False + ) + + @classmethod + def _dict_to_yaml(cls, dictionary): + if not isinstance(dictionary, dict): + return + for k, v in dictionary.items(): + if isinstance(v, dict): + cls._dict_to_yaml(v) + if isinstance(v, np.ndarray): + dictionary[k] = v.tolist() + if isinstance(v, Path): + dictionary[k] = str(v) + + @classmethod + def _load_yaml(cls, filepath): + with open(filepath) as f: + file = yaml.safe_load(f) + return file + + +if __name__ == "__main__": + Fire(BasePreprocessing) diff --git a/models/Mask3D/mask3d/datasets/preprocessing/s3dis_preprocessing.py b/models/Mask3D/mask3d/datasets/preprocessing/s3dis_preprocessing.py new file mode 100644 index 
def create_label_database(self):
    """Build the label database from ``class_map`` and save it as YAML.

    Several class names may share one id (aliases). The first name listed
    for an id is kept, so an alias appearing later in ``class_map`` does
    not rename the class.

    Returns:
        dict mapping class id to its ``color``, ``name`` and
        ``validation`` flag.
    """
    label_database = dict()
    for class_name, class_id in self.class_map.items():
        if class_id in label_database:
            # alias of an id that already has its name
            continue
        label_database[class_id] = {
            "color": self.color_map[class_id],
            "name": class_name,
            "validation": True,
        }

    self._save_yaml(self.save_dir / "label_database.yaml", label_database)
    return label_database
while True: + b = reader(2**16) + if not b: + break + yield b + + with open(fname, "rb") as f: + count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read)) + return count + + def process_file(self, filepath, mode): + """process_file. + + Please note, that for obtaining segmentation labels ply files were used. + + Args: + filepath: path to the main ply file + mode: train, test or validation + + Returns: + filebase: info about file + """ + filebase = { + "filepath": filepath, + "scene": filepath.split("/")[-1], + "area": mode, + "raw_filepath": str(filepath), + "file_len": -1, + } + + scene_name = filepath.split("/")[-1] + instance_counter = 0 + scene_points = [] + for instance in [ + f + for f in os.scandir( + self.data_dir / mode / scene_name / "Annotations" + ) + if f.name.endswith(".txt") + ]: + instance_class = self.class_map[instance.name.split("_")[0]] + instance_points = np.loadtxt(instance.path) + + instance_normals = np.ones((instance_points.shape[0], 3)) + instance_class = np.array(instance_class).repeat( + instance_points.shape[0] + )[..., None] + instance_id = np.array(instance_counter).repeat( + instance_points.shape[0] + )[..., None] + + instance_points = np.hstack( + ( + instance_points, + instance_normals, + instance_class, + instance_id, + ) + ) + + scene_points.append(instance_points) + instance_counter += 1 + + points = np.vstack(scene_points) + + pcd_size = self._buf_count_newlines_gen(f"{filepath}/{scene_name}.txt") + if points.shape[0] != pcd_size: + print(f"FILE SIZE DOES NOT MATCH FOR {filepath}/{scene_name}.txt") + print(f"({points.shape[0]} vs. 
def compute_color_mean_std(self, train_database_path: str = ""):
    """Write per-area and leave-one-area-out colour statistics.

    For every ``Area_*_database.yaml`` in ``self.save_dir`` two files are
    saved:

    * ``<database file name>_color_mean_std.yaml``: statistics of that
      area alone;
    * ``<area>_color_mean_std.yaml``: statistics of all the *other* areas.

    The ``color_std`` entry of a sample holds the mean of the squared
    colours, so the standard deviation is ``sqrt(E[x^2] - E[x]^2)``.

    Args:
        train_database_path: unused; kept for signature compatibility.
    """

    def _stats(samples):
        """Aggregate the per-sample moments into mean and std lists."""
        mean = np.array([s["color_mean"] for s in samples]).mean(axis=0)
        second_moment = np.array(
            [s["color_std"] for s in samples]
        ).mean(axis=0)
        # Rounding can push the variance slightly below zero; clip it so
        # the square root never yields NaN.
        std = np.sqrt(np.maximum(second_moment - mean**2, 0.0))
        return {
            "mean": [float(each) for each in mean],
            "std": [float(each) for each in std],
        }

    # Only the databases themselves. Matching every "Area_*.yaml" would
    # also pick up the statistics files written below on a second run.
    with os.scandir(self.save_dir) as entries:
        database_names = sorted(
            entry.name
            for entry in entries
            if entry.name.startswith("Area_")
            and entry.name.endswith("_database.yaml")
        )

    # Load every database once instead of once per left-out area.
    databases = {
        name: self._load_yaml(os.path.join(self.save_dir, name)) or []
        for name in database_names
    }

    for name, samples in databases.items():
        if not samples:
            continue
        self._save_yaml(
            self.save_dir / f"{name}_color_mean_std.yaml", _stats(samples)
        )

    for name in database_names:
        other_samples = [
            sample
            for other_name, samples in databases.items()
            if other_name != name
            for sample in samples
        ]
        if not other_samples:
            # nothing to aggregate when this is the only area available
            continue
        file_path = name.replace("_database.yaml", "")
        self._save_yaml(
            self.save_dir / f"{file_path}_color_mean_std.yaml",
            _stats(other_samples),
        )
def create_label_database(self, git_repo):
    """Create (or load) the label database and save it as YAML.

    With ``scannet200`` enabled the database is built from the ScanNet200
    constants. Otherwise an existing ``label_database.yaml`` is reused,
    or the NYU40 classes are read from ``scannetv2-labels.combined.tsv``,
    flagged for validation according to the benchmark class list and
    coloured with the palette found in the ScanNet repository.

    Args:
        git_repo: path of the cloned ScanNet repository.

    Returns:
        dict mapping class id to its ``name``, ``validation`` flag and
        ``color``.
    """
    import ast  # local: only needed to parse the colour palette

    label_database_path = self.save_dir / "label_database.yaml"

    if self.scannet200:
        label_database = {}
        for row_id, class_id in enumerate(VALID_CLASS_IDS_200):
            label_database[class_id] = {
                "color": SCANNET_COLOR_MAP_200[class_id],
                "name": CLASS_LABELS_200[row_id],
                "validation": True,
            }
        self._save_yaml(label_database_path, label_database)
        return label_database

    if label_database_path.exists():
        return self._load_yaml(label_database_path)

    df = pd.read_csv(
        self.data_dir / "scannetv2-labels.combined.tsv", sep="\t"
    )
    df = (
        df[~df[["nyu40class", "nyu40id"]].duplicated()][
            ["nyu40class", "nyu40id"]
        ]
        .set_index("nyu40id")
        .sort_index()[["nyu40class"]]
        .rename(columns={"nyu40class": "name"})
        .replace(" ", "_", regex=True)
    )
    # DataFrame.append was removed in pandas 2.0; concat keeps the same
    # row order and index (0 for "empty", then the nyu40 ids).
    df = pd.concat([pd.DataFrame([{"name": "empty"}]), df])
    df["validation"] = False

    with open(
        git_repo / "Tasks" / "Benchmark" / "classes_SemVoxLabel-nyu40id.txt"
    ) as f:
        for category in f.read().split("\n"):
            if not category.strip():
                # e.g. the empty string left by a trailing newline
                continue
            index = int(category.split()[0])
            df.loc[index, "validation"] = True

    # doing this hack because otherwise I will have to install imageio
    # NOTE(review): the palette is cut out of the repository's util.py
    # source. literal_eval accepts the same list-of-tuples literal as eval
    # did, without executing arbitrary code from that file.
    with open(git_repo / "BenchmarkScripts" / "util.py") as f:
        util = f.read()
    color_list = ast.literal_eval("[" + util.split("return [\n")[1])

    df["color"] = color_list

    label_database = df.to_dict("index")
    self._save_yaml(label_database_path, label_database)
    return label_database
+ + Args: + filepath: path to the main ply file + mode: train, test or validation + + Returns: + filebase: info about file + """ + scene, sub_scene = self._parse_scene_subscene(filepath.name) + filebase = { + "filepath": filepath, + "scene": scene, + "sub_scene": sub_scene, + "raw_filepath": str(filepath), + "file_len": -1, + } + # reading both files and checking that they are fitting + coords, features, _ = load_ply_with_normals(filepath) + file_len = len(coords) + filebase["file_len"] = file_len + points = np.hstack((coords, features)) + + if mode in ["train", "validation"]: + # getting scene information + description_filepath = Path( + filepath + ).parent / filepath.name.replace("_vh_clean_2.ply", ".txt") + with open(description_filepath) as f: + scene_type = f.read().split("\n")[:-1] + scene_type = scene_type[-1].split(" = ")[1] + filebase["scene_type"] = scene_type + filebase["raw_description_filepath"] = description_filepath + + # getting instance info + instance_info_filepath = next( + Path(filepath).parent.glob("*.aggregation.json") + ) + segment_indexes_filepath = next( + Path(filepath).parent.glob("*[0-9].segs.json") + ) + instance_db = self._read_json(instance_info_filepath) + segments = self._read_json(segment_indexes_filepath) + segments = np.array(segments["segIndices"]) + filebase["raw_instance_filepath"] = instance_info_filepath + filebase["raw_segmentation_filepath"] = segment_indexes_filepath + + # add segment id as additional feature + segment_ids = np.unique(segments, return_inverse=True)[1] + points = np.hstack((points, segment_ids[..., None])) + + # reading labels file + label_filepath = filepath.parent / filepath.name.replace( + ".ply", ".labels.ply" + ) + filebase["raw_label_filepath"] = label_filepath + label_coords, label_colors, labels = load_ply_with_normals( + label_filepath + ) + if not np.allclose(coords, label_coords): + raise ValueError("files doesn't have same coordinates") + + # adding instance label + labels = labels[:, 
np.newaxis] + empty_instance_label = np.full(labels.shape, -1) + labels = np.hstack((labels, empty_instance_label)) + for instance in instance_db["segGroups"]: + segments_occupied = np.array(instance["segments"]) + occupied_indices = np.isin(segments, segments_occupied) + labels[occupied_indices, 1] = instance["id"] + + if self.scannet200: + label200 = instance["label"] + # Map the category name to id + label_ids = self.labels_pd[ + self.labels_pd["raw_category"] == label200 + ]["id"] + label_id = ( + int(label_ids.iloc[0]) if len(label_ids) > 0 else 0 + ) + labels[occupied_indices, 0] = label_id + points = np.hstack((points, labels)) + + # gt_data = (points[:, -2] + 1) * 1000 + points[:, -1] + 1 + gt_data = points[:, -2] * 1000 + points[:, -1] + 1 + else: + segments_test = "../../data/raw/scannet_test_segments" + segment_indexes_filepath = filepath.name.replace( + ".ply", ".0.010000.segs.json" + ) + segments = self._read_json( + f"{segments_test}/{segment_indexes_filepath}" + ) + segments = np.array(segments["segIndices"]) + # add segment id as additional feature + segment_ids = np.unique(segments, return_inverse=True)[1] + points = np.hstack((points, segment_ids[..., None])) + + processed_filepath = ( + self.save_dir / mode / f"{scene:04}_{sub_scene:02}.npy" + ) + if not processed_filepath.parent.exists(): + processed_filepath.parent.mkdir(parents=True, exist_ok=True) + np.save(processed_filepath, points.astype(np.float32)) + filebase["filepath"] = str(processed_filepath) + + if mode == "test": + return filebase + + processed_gt_filepath = ( + self.save_dir + / "instance_gt" + / mode + / f"scene{scene:04}_{sub_scene:02}.txt" + ) + if not processed_gt_filepath.parent.exists(): + processed_gt_filepath.parent.mkdir(parents=True, exist_ok=True) + np.savetxt(processed_gt_filepath, gt_data.astype(np.int32), fmt="%d") + filebase["instance_gt_filepath"] = str(processed_gt_filepath) + + filebase["color_mean"] = [ + float((features[:, 0] / 255).mean()), + 
def compute_color_mean_std(
    self,
    train_database_path: str = "./data/processed/scannet/train_database.yaml",
):
    """Aggregate the per-scene colour moments into dataset statistics.

    The ``color_std`` entry of a sample holds the mean of the squared
    colours, so the standard deviation is ``sqrt(E[x^2] - E[x]^2)``. The
    result is saved as ``color_mean_std.yaml`` in ``self.save_dir``.

    Args:
        train_database_path: YAML database with one entry per scene.

    Raises:
        ValueError: if the database contains no samples.
    """
    train_database = self._load_yaml(train_database_path)
    if not train_database:
        raise ValueError(f"no samples found in {train_database_path}")

    color_mean = np.array(
        [sample["color_mean"] for sample in train_database]
    ).mean(axis=0)
    second_moment = np.array(
        [sample["color_std"] for sample in train_database]
    ).mean(axis=0)
    # Rounding can push the variance slightly below zero; clip it so the
    # square root never yields NaN.
    color_std = np.sqrt(np.maximum(second_moment - color_mean**2, 0.0))
    feats_mean_std = {
        "mean": [float(each) for each in color_mean],
        "std": [float(each) for each in color_std],
    }
    self._save_yaml(self.save_dir / "color_mean_std.yaml", feats_mean_std)
def __init__(
    self,
    data_dir: str = "./data/raw/semantic_kitti",
    save_dir: str = "./data/processed/semantic_kitti",
    modes: tuple = ("train", "validation", "test"),
    n_jobs: int = -1,
    git_repo: str = "./data/raw/semantic-kitti-api",
):
    """Collect the scans of every split and read the poses per sequence.

    Args:
        data_dir: folder searched for ``*/<scene>/velodyne/*bin`` files.
        save_dir: output folder, forwarded to the base class.
        modes: dataset splits; ``"validation"`` is looked up as
            ``"valid"`` in the split configuration.
        n_jobs: forwarded to the base class.
        git_repo: checkout containing ``config/semantic-kitti.yaml``.

    Raises:
        FileNotFoundError: if a sequence listed in the split has no scans.
    """
    super().__init__(data_dir, save_dir, modes, n_jobs)

    config_path = Path(git_repo) / "config" / "semantic-kitti.yaml"
    self.create_label_database(config_path)
    self.config = self._load_yaml(config_path)
    self.pose = dict()

    for mode in self.modes:
        scene_mode = "valid" if mode == "validation" else mode
        self.pose[mode] = dict()
        for scene in sorted(self.config["split"][scene_mode]):
            filepaths = natsorted(
                [
                    str(file)
                    for file in self.data_dir.glob(
                        f"*/{scene:02}/velodyne/*bin"
                    )
                ]
            )
            if not filepaths:
                # Indexing the empty list would only report
                # "list index out of range".
                raise FileNotFoundError(
                    f"no scans found for sequence {scene:02} "
                    f"under {self.data_dir}"
                )
            self.files[mode].extend(filepaths)

            # calib.txt and poses.txt sit next to the velodyne folder
            sequence_dir = Path(filepaths[0]).parent.parent
            calibration = parse_calibration(sequence_dir / "calib.txt")
            self.pose[mode][scene] = parse_poses(
                sequence_dir / "poses.txt", calibration
            )
label_database + + def process_file(self, filepath, mode): + """process_file. + + Args: + filepath: path to the main ply file + mode: train, test + + Returns: + filebase: info about file + """ + scene, sub_scene = re.search(r"(\d{2}).*(\d{6})", filepath).group(1, 2) + filebase = { + "filepath": filepath, + "scene": int(scene), + "sub_scene": int(sub_scene), + "file_len": -1, + "pose": self.pose[mode][int(scene)][int(sub_scene)].tolist(), + } + + points = np.fromfile(filepath, dtype=np.float32).reshape(-1, 4) + file_len = len(points) + filebase["file_len"] = file_len + + if mode in ["train", "validation"]: + # getting label info + label_filepath = filepath.replace("velodyne", "labels").replace( + "bin", "label" + ) + filebase["label_filepath"] = label_filepath + label = np.fromfile(label_filepath, dtype=np.uint32).astype( + np.int32 + ) + if not points.shape[0] == label.shape[0]: + raise ValueError("Files do not have same length") + semantic_label = label & 0xFFFF + instance_label = label >> 16 + + semantic_label_copy = semantic_label.copy() + for label in np.unique(semantic_label): + semantic_label[semantic_label_copy == label] = self.config[ + "learning_map" + ][label] + + label = np.hstack( + (semantic_label[:, np.newaxis], instance_label[:, np.newaxis]) + ) + points = np.hstack((points, label)) + + processed_filepath = self.save_dir / mode / f"{scene}_{sub_scene}.npy" + if not processed_filepath.parent.exists(): + processed_filepath.parent.mkdir(parents=True, exist_ok=True) + np.save(processed_filepath, points.astype(np.float32)) + filebase["filepath"] = str(processed_filepath) + + return filebase + + +def parse_calibration(filename): + """read calibration file with given filename + Returns + ------- + dict + Calibration matrices as 4x4 numpy arrays. 
+ """ + calib = {} + + with open(filename) as calib_file: + for line in calib_file: + key, content = line.strip().split(":") + values = [float(v) for v in content.strip().split()] + + pose = np.zeros((4, 4)) + pose[0, 0:4] = values[0:4] + pose[1, 0:4] = values[4:8] + pose[2, 0:4] = values[8:12] + pose[3, 3] = 1.0 + + calib[key] = pose + return calib + + +def parse_poses(filename, calibration): + """read poses file with per-scan poses from given filename + Returns + ------- + list + list of poses as 4x4 numpy arrays. + """ + + poses = [] + + Tr = calibration["Tr"] + Tr_inv = np.linalg.inv(Tr) + + with open(filename) as file: + for line in file: + values = [float(v) for v in line.strip().split()] + + pose = np.zeros((4, 4)) + pose[0, 0:4] = values[0:4] + pose[1, 0:4] = values[4:8] + pose[2, 0:4] = values[8:12] + pose[3, 3] = 1.0 + + poses.append(np.matmul(Tr_inv, np.matmul(pose, Tr))) + + return poses + + +if __name__ == "__main__": + Fire(SemanticKittiPreprocessing) diff --git a/models/Mask3D/mask3d/datasets/preprocessing/stpls3d_preprocessing.py b/models/Mask3D/mask3d/datasets/preprocessing/stpls3d_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..63ed5bff5d52e656f4bad2f853e5973b433871bd --- /dev/null +++ b/models/Mask3D/mask3d/datasets/preprocessing/stpls3d_preprocessing.py @@ -0,0 +1,291 @@ +import re +import os +import numpy as np +from fire import Fire +from natsort import natsorted +from loguru import logger +import pandas as pd + +from datasets.preprocessing.base_preprocessing import BasePreprocessing + + +class STPLS3DPreprocessing(BasePreprocessing): + def __init__( + self, + data_dir: str = "../../data/raw/stpls3d", + save_dir: str = "../../data/processed/stpls3d", + modes: tuple = ("train", "validation", "test"), + n_jobs: int = -1, + ): + super().__init__(data_dir, save_dir, modes, n_jobs) + + # https://github.com/meidachen/STPLS3D/blob/main/HAIS/STPLS3DInstanceSegmentationChallenge_Codalab_Evaluate.py#L31 + 
def process_file(self, filepath, mode):
    """Convert one STPLS3D scene file into ``.npy`` arrays and describe it.

    The scene is read as a header-less CSV.  Dummy columns of ones are
    appended so every scene ends up with twelve columns, which are then
    reordered to: the six leading input columns (the first three are
    shifted as coordinates, the next three are used as 0-255 colours), three
    dummy normals, one dummy segment id, and finally the two label columns.
    For ``test`` the two label columns are dummies too and are dropped
    again.  For ``validation`` and ``test`` the scene is additionally cut
    into blocks by ``self.splitPointCloud`` and every block is saved on its
    own; ``validation`` also writes one ground-truth text file per block.

    Args:
        filepath: path (``str``, ``/``-separated) to the scene text file.
            NOTE(review): ``train``/``validation`` files presumably carry
            eight columns (xyz, rgb, semantic id, instance id) and ``test``
            files six - confirm against the raw data.
        mode: train, test or validation

    Returns:
        filebase: info about file (paths of everything written, number of
        points, and per-channel mean and mean-of-squares of the colours
        scaled to [0, 1]; the latter is stored under ``color_std``).

    Raises:
        AssertionError: if a block does not have more than 10000 points or
            holds 1000 or more distinct instance ids.
    """
    min_block_points = 10000
    scene_file = filepath.split("/")[-1]
    scene_name = scene_file.replace(".txt", "")

    filebase = {
        "filepath": filepath,
        "scene": scene_file,
        "raw_filepath": str(filepath),
        "file_len": -1,
    }

    points = pd.read_csv(filepath, header=None).values

    filebase["raw_segmentation_filepath"] = ""

    # Pad with dummy columns of ones: three normals and a segment id, plus
    # semantic class and instance id when the split ships without labels.
    n_dummy = 4 if mode in ["train", "validation"] else 6
    points = np.hstack((points, np.ones((points.shape[0], n_dummy))))

    points = points[
        :, [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 6, 7]
    ]  # move segments after RGB

    # move point clouds to be in positive range (important for split pointcloud function)
    points[:, :3] = points[:, :3] - points[:, :3].min(0)

    points = points.astype(np.float32)

    if mode == "test":
        points = points[:, :-2]
    else:
        points[
            points[:, -1] == -100.0, -1
        ] = -1  # -1 indicates "no instance"

    filebase["file_len"] = len(points)

    processed_filepath = self.save_dir / mode / f"{scene_name}.npy"
    processed_filepath.parent.mkdir(parents=True, exist_ok=True)
    np.save(processed_filepath, points.astype(np.float32))
    filebase["filepath"] = str(processed_filepath)

    if mode in ["validation", "test"]:
        blocks = self.splitPointCloud(points)

        filebase["instance_gt_filepath"] = []
        filebase["filepath_crop"] = []
        for block_id, block in enumerate(blocks):
            if len(block) <= min_block_points:
                # Raised explicitly (not `assert False`) so the check also
                # fires under `python -O`; the message states the real
                # threshold.
                raise AssertionError(
                    f"block {block_id} of {scene_file} has {len(block)} "
                    f"points, but more than {min_block_points} are required"
                )

            if mode == "validation":
                # Re-number the instance ids of this block densely from 0.
                new_instance_ids = np.unique(
                    block[:, -1], return_inverse=True
                )[1]

                assert new_instance_ids.shape[0] == block.shape[0]
                assert (
                    new_instance_ids.max() < 1000
                ), "we cannot encode when there are more than 999 instances in a block"

                # One integer per point: semantic id * 1000 + instance id.
                gt_data = (block[:, -2]) * 1000 + new_instance_ids

                processed_gt_filepath = (
                    self.save_dir
                    / "instance_gt"
                    / mode
                    / f"{scene_name}_{block_id}.txt"
                )
                processed_gt_filepath.parent.mkdir(
                    parents=True, exist_ok=True
                )
                np.savetxt(
                    processed_gt_filepath,
                    gt_data.astype(np.int32),
                    fmt="%d",
                )
                filebase["instance_gt_filepath"].append(
                    str(processed_gt_filepath)
                )

            processed_filepath = (
                self.save_dir / mode / f"{scene_name}_{block_id}.npy"
            )
            processed_filepath.parent.mkdir(parents=True, exist_ok=True)
            np.save(processed_filepath, block.astype(np.float32))
            filebase["filepath_crop"].append(str(processed_filepath))

    filebase["color_mean"] = [
        float((points[:, 3] / 255).mean()),
        float((points[:, 4] / 255).mean()),
        float((points[:, 5] / 255).mean()),
    ]
    # Mean of squares, not a standard deviation; the key name is kept
    # because it is part of the returned record.
    filebase["color_std"] = [
        float(((points[:, 3] / 255) ** 2).mean()),
        float(((points[:, 4] / 255) ** 2).mean()),
        float(((points[:, 5] / 255) ** 2).mean()),
    ]
    return filebase
def splitPointCloud(self, cloud, size=50.0, stride=50):
    """Cut a point cloud into square tiles in the xy-plane.

    Tile origins lie on a grid with spacing ``stride`` that starts at 0 on
    both axes; the number of tiles per axis is derived from the largest x
    and y value in ``cloud``.  A point belongs to a tile when both
    coordinates lie inside the closed interval ``[origin, origin + size]``,
    so a point on a shared border is returned in every tile touching it.

    Args:
        cloud: 2-D array with x in column 0 and y in column 1.
        size: edge length of a tile.
        stride: distance between the origins of neighbouring tiles.

    Returns:
        list of arrays, one per tile (possibly empty), ordered with x as
        the outer and y as the inner grid index.
    """
    upper = np.amax(cloud[:, 0:3], axis=0)
    tiles_x = int(np.ceil((upper[0] - size) / stride)) + 1
    tiles_y = int(np.ceil((upper[1] - size) / stride)) + 1

    xs = cloud[:, 0]
    ys = cloud[:, 1]

    tiles = []
    for ix in range(tiles_x):
        x0 = ix * stride
        inside_x = (xs <= x0 + size) & (xs >= x0)
        for iy in range(tiles_y):
            y0 = iy * stride
            inside_y = (ys <= y0 + size) & (ys >= y0)
            tiles.append(cloud[inside_x & inside_y, :])
    return tiles
class RandomCuboid(object):
    """Random square crop in the xy-plane, returned as a boolean point mask.

    Adapted from the RandomCuboid augmentation of DepthContrast
    [https://arxiv.org/abs/2101.02691].  Only the first two coordinate
    columns are looked at: a window with edge ``crop_length`` is placed
    around a randomly shifted centre and the points inside it are selected.
    A crop is accepted only when it keeps at least ``min_points`` points.
    """

    def __init__(
        self,
        min_points,
        crop_length=6.0,
        version1=True,
    ):
        """Store the crop settings.

        Args:
            min_points: minimum number of points an accepted crop must keep.
            crop_length: edge length of the square crop window.
            version1: if true, shift the window centre by up to a quarter of
                the cloud extent per axis; otherwise shift it by up to half
                the extent minus a quarter of ``crop_length``.
        """
        self.crop_length = crop_length
        self.min_points = min_points
        self.version1 = version1

    def __call__(self, point_cloud):
        """Return a boolean mask selecting the cropped points.

        Args:
            point_cloud: 2-D array with x in column 0 and y in column 1.

        Returns:
            Boolean array with one entry per point.  It is all ``True`` when
            the cloud has fewer than ``min_points`` points or when none of
            the 100 attempts keeps at least ``min_points`` points.
        """
        n_points = point_cloud.shape[0]
        if n_points < self.min_points:
            print("too small pcd")
            # Builtin bool: the np.bool alias was removed in NumPy 1.24.
            return np.ones(n_points, dtype=bool)

        xy = point_cloud[:, :2]
        xy_min = xy.min(axis=0)
        range_xyz = np.max(xy, axis=0) - xy_min
        half_length = self.crop_length / 2

        for _ in range(100):
            # Fresh array every attempt; it is shifted in place below.
            sample_center = xy_min + range_xyz / 2

            if self.version1:
                offset_x = np.random.uniform(
                    -range_xyz[0] / 4, range_xyz[0] / 4
                )
                offset_y = np.random.uniform(
                    -range_xyz[1] / 4, range_xyz[1] / 4
                )
            else:
                offset_x = np.random.uniform(
                    -(range_xyz[0] / 2) + self.crop_length / 4,
                    +(range_xyz[0] / 2) - self.crop_length / 4,
                )
                offset_y = np.random.uniform(
                    -(range_xyz[1] / 2) + self.crop_length / 4,
                    +(range_xyz[1] / 2) - self.crop_length / 4,
                )

            sample_center[0] = sample_center[0] + offset_x
            sample_center[1] = sample_center[1] + offset_y

            min_xy = sample_center - half_length
            max_xy = sample_center + half_length

            # A point is kept when both coordinates lie inside the window.
            new_pointidx = np.all(xy <= max_xy, axis=1) & np.all(
                xy >= min_xy, axis=1
            )

            if np.sum(new_pointidx) < self.min_points:
                print("TOO SMALL")
                continue

            return new_pointidx

        # fallback
        print("FALLBACK")
        return np.ones(n_points, dtype=bool)
"bed", + "chair", + "sofa", + "table", + "door", + "window", + "bookshelf", + "picture", + "counter", + "desk", + "curtain", + "refrigerator", + "shower curtain", + "toilet", + "sink", + "bathtub", + "otherfurniture", +) + +SCANNET_COLOR_MAP_20 = { + 0: (0.0, 0.0, 0.0), + 1: (174.0, 199.0, 232.0), + 2: (152.0, 223.0, 138.0), + 3: (31.0, 119.0, 180.0), + 4: (255.0, 187.0, 120.0), + 5: (188.0, 189.0, 34.0), + 6: (140.0, 86.0, 75.0), + 7: (255.0, 152.0, 150.0), + 8: (214.0, 39.0, 40.0), + 9: (197.0, 176.0, 213.0), + 10: (148.0, 103.0, 189.0), + 11: (196.0, 156.0, 148.0), + 12: (23.0, 190.0, 207.0), + 14: (247.0, 182.0, 210.0), + 15: (66.0, 188.0, 102.0), + 16: (219.0, 219.0, 141.0), + 17: (140.0, 57.0, 197.0), + 18: (202.0, 185.0, 52.0), + 19: (51.0, 176.0, 203.0), + 20: (200.0, 54.0, 131.0), + 21: (92.0, 193.0, 61.0), + 22: (78.0, 71.0, 183.0), + 23: (172.0, 114.0, 82.0), + 24: (255.0, 127.0, 14.0), + 25: (91.0, 163.0, 138.0), + 26: (153.0, 98.0, 156.0), + 27: (140.0, 153.0, 101.0), + 28: (158.0, 218.0, 229.0), + 29: (100.0, 125.0, 154.0), + 30: (178.0, 127.0, 135.0), + 32: (146.0, 111.0, 194.0), + 33: (44.0, 160.0, 44.0), + 34: (112.0, 128.0, 144.0), + 35: (96.0, 207.0, 209.0), + 36: (227.0, 119.0, 194.0), + 37: (213.0, 92.0, 176.0), + 38: (94.0, 106.0, 211.0), + 39: (82.0, 84.0, 163.0), + 40: (100.0, 85.0, 144.0), +} + +### ScanNet200 Benchmark constants ### +VALID_CLASS_IDS_200 = ( + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, 
+ 121, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 221, + 229, + 230, + 232, + 233, + 242, + 250, + 261, + 264, + 276, + 283, + 286, + 300, + 304, + 312, + 323, + 325, + 331, + 342, + 356, + 370, + 392, + 395, + 399, + 408, + 417, + 488, + 540, + 562, + 570, + 572, + 581, + 609, + 748, + 776, + 1156, + 1163, + 1164, + 1165, + 1166, + 1167, + 1168, + 1169, + 1170, + 1171, + 1172, + 1173, + 1174, + 1175, + 1176, + 1178, + 1179, + 1180, + 1181, + 1182, + 1183, + 1184, + 1185, + 1186, + 1187, + 1188, + 1189, + 1190, + 1191, +) + +CLASS_LABELS_200 = ( + "wall", + "chair", + "floor", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + "basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", + "light", + "laundry basket", + "pipe", + "clothes 
dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "bicycle", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "candle", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "guitar case", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "cd case", + "closet rod", + "coffee kettle", + "structure", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "storage organizer", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "luggage", + "mattress", +) + +SCANNET_COLOR_MAP_200 = { + 0: (0.0, 0.0, 0.0), + 1: (174.0, 199.0, 232.0), + 2: (188.0, 189.0, 34.0), + 3: (152.0, 223.0, 138.0), + 4: (255.0, 152.0, 150.0), + 5: (214.0, 39.0, 40.0), + 6: (91.0, 135.0, 229.0), + 7: (31.0, 119.0, 180.0), + 8: (229.0, 91.0, 
104.0), + 9: (247.0, 182.0, 210.0), + 10: (91.0, 229.0, 110.0), + 11: (255.0, 187.0, 120.0), + 13: (141.0, 91.0, 229.0), + 14: (112.0, 128.0, 144.0), + 15: (196.0, 156.0, 148.0), + 16: (197.0, 176.0, 213.0), + 17: (44.0, 160.0, 44.0), + 18: (148.0, 103.0, 189.0), + 19: (229.0, 91.0, 223.0), + 21: (219.0, 219.0, 141.0), + 22: (192.0, 229.0, 91.0), + 23: (88.0, 218.0, 137.0), + 24: (58.0, 98.0, 137.0), + 26: (177.0, 82.0, 239.0), + 27: (255.0, 127.0, 14.0), + 28: (237.0, 204.0, 37.0), + 29: (41.0, 206.0, 32.0), + 31: (62.0, 143.0, 148.0), + 32: (34.0, 14.0, 130.0), + 33: (143.0, 45.0, 115.0), + 34: (137.0, 63.0, 14.0), + 35: (23.0, 190.0, 207.0), + 36: (16.0, 212.0, 139.0), + 38: (90.0, 119.0, 201.0), + 39: (125.0, 30.0, 141.0), + 40: (150.0, 53.0, 56.0), + 41: (186.0, 197.0, 62.0), + 42: (227.0, 119.0, 194.0), + 44: (38.0, 100.0, 128.0), + 45: (120.0, 31.0, 243.0), + 46: (154.0, 59.0, 103.0), + 47: (169.0, 137.0, 78.0), + 48: (143.0, 245.0, 111.0), + 49: (37.0, 230.0, 205.0), + 50: (14.0, 16.0, 155.0), + 51: (196.0, 51.0, 182.0), + 52: (237.0, 80.0, 38.0), + 54: (138.0, 175.0, 62.0), + 55: (158.0, 218.0, 229.0), + 56: (38.0, 96.0, 167.0), + 57: (190.0, 77.0, 246.0), + 58: (208.0, 49.0, 84.0), + 59: (208.0, 193.0, 72.0), + 62: (55.0, 220.0, 57.0), + 63: (10.0, 125.0, 140.0), + 64: (76.0, 38.0, 202.0), + 65: (191.0, 28.0, 135.0), + 66: (211.0, 120.0, 42.0), + 67: (118.0, 174.0, 76.0), + 68: (17.0, 242.0, 171.0), + 69: (20.0, 65.0, 247.0), + 70: (208.0, 61.0, 222.0), + 71: (162.0, 62.0, 60.0), + 72: (210.0, 235.0, 62.0), + 73: (45.0, 152.0, 72.0), + 74: (35.0, 107.0, 149.0), + 75: (160.0, 89.0, 237.0), + 76: (227.0, 56.0, 125.0), + 77: (169.0, 143.0, 81.0), + 78: (42.0, 143.0, 20.0), + 79: (25.0, 160.0, 151.0), + 80: (82.0, 75.0, 227.0), + 82: (253.0, 59.0, 222.0), + 84: (240.0, 130.0, 89.0), + 86: (123.0, 172.0, 47.0), + 87: (71.0, 194.0, 133.0), + 88: (24.0, 94.0, 205.0), + 89: (134.0, 16.0, 179.0), + 90: (159.0, 32.0, 52.0), + 93: (213.0, 208.0, 88.0), + 95: (64.0, 
158.0, 70.0), + 96: (18.0, 163.0, 194.0), + 97: (65.0, 29.0, 153.0), + 98: (177.0, 10.0, 109.0), + 99: (152.0, 83.0, 7.0), + 100: (83.0, 175.0, 30.0), + 101: (18.0, 199.0, 153.0), + 102: (61.0, 81.0, 208.0), + 103: (213.0, 85.0, 216.0), + 104: (170.0, 53.0, 42.0), + 105: (161.0, 192.0, 38.0), + 106: (23.0, 241.0, 91.0), + 107: (12.0, 103.0, 170.0), + 110: (151.0, 41.0, 245.0), + 112: (133.0, 51.0, 80.0), + 115: (184.0, 162.0, 91.0), + 116: (50.0, 138.0, 38.0), + 118: (31.0, 237.0, 236.0), + 120: (39.0, 19.0, 208.0), + 121: (223.0, 27.0, 180.0), + 122: (254.0, 141.0, 85.0), + 125: (97.0, 144.0, 39.0), + 128: (106.0, 231.0, 176.0), + 130: (12.0, 61.0, 162.0), + 131: (124.0, 66.0, 140.0), + 132: (137.0, 66.0, 73.0), + 134: (250.0, 253.0, 26.0), + 136: (55.0, 191.0, 73.0), + 138: (60.0, 126.0, 146.0), + 139: (153.0, 108.0, 234.0), + 140: (184.0, 58.0, 125.0), + 141: (135.0, 84.0, 14.0), + 145: (139.0, 248.0, 91.0), + 148: (53.0, 200.0, 172.0), + 154: (63.0, 69.0, 134.0), + 155: (190.0, 75.0, 186.0), + 156: (127.0, 63.0, 52.0), + 157: (141.0, 182.0, 25.0), + 159: (56.0, 144.0, 89.0), + 161: (64.0, 160.0, 250.0), + 163: (182.0, 86.0, 245.0), + 165: (139.0, 18.0, 53.0), + 166: (134.0, 120.0, 54.0), + 168: (49.0, 165.0, 42.0), + 169: (51.0, 128.0, 133.0), + 170: (44.0, 21.0, 163.0), + 177: (232.0, 93.0, 193.0), + 180: (176.0, 102.0, 54.0), + 185: (116.0, 217.0, 17.0), + 188: (54.0, 209.0, 150.0), + 191: (60.0, 99.0, 204.0), + 193: (129.0, 43.0, 144.0), + 195: (252.0, 100.0, 106.0), + 202: (187.0, 196.0, 73.0), + 208: (13.0, 158.0, 40.0), + 213: (52.0, 122.0, 152.0), + 214: (128.0, 76.0, 202.0), + 221: (187.0, 50.0, 115.0), + 229: (180.0, 141.0, 71.0), + 230: (77.0, 208.0, 35.0), + 232: (72.0, 183.0, 168.0), + 233: (97.0, 99.0, 203.0), + 242: (172.0, 22.0, 158.0), + 250: (155.0, 64.0, 40.0), + 261: (118.0, 159.0, 30.0), + 264: (69.0, 252.0, 148.0), + 276: (45.0, 103.0, 173.0), + 283: (111.0, 38.0, 149.0), + 286: (184.0, 9.0, 49.0), + 300: (188.0, 174.0, 67.0), + 304: (53.0, 
206.0, 53.0), + 312: (97.0, 235.0, 252.0), + 323: (66.0, 32.0, 182.0), + 325: (236.0, 114.0, 195.0), + 331: (241.0, 154.0, 83.0), + 342: (133.0, 240.0, 52.0), + 356: (16.0, 205.0, 144.0), + 370: (75.0, 101.0, 198.0), + 392: (237.0, 95.0, 251.0), + 395: (191.0, 52.0, 49.0), + 399: (227.0, 254.0, 54.0), + 408: (49.0, 206.0, 87.0), + 417: (48.0, 113.0, 150.0), + 488: (125.0, 73.0, 182.0), + 540: (229.0, 32.0, 114.0), + 562: (158.0, 119.0, 28.0), + 570: (60.0, 205.0, 27.0), + 572: (18.0, 215.0, 201.0), + 581: (79.0, 76.0, 153.0), + 609: (134.0, 13.0, 116.0), + 748: (192.0, 97.0, 63.0), + 776: (108.0, 163.0, 18.0), + 1156: (95.0, 220.0, 156.0), + 1163: (98.0, 141.0, 208.0), + 1164: (144.0, 19.0, 193.0), + 1165: (166.0, 36.0, 57.0), + 1166: (212.0, 202.0, 34.0), + 1167: (23.0, 206.0, 34.0), + 1168: (91.0, 211.0, 236.0), + 1169: (79.0, 55.0, 137.0), + 1170: (182.0, 19.0, 117.0), + 1171: (134.0, 76.0, 14.0), + 1172: (87.0, 185.0, 28.0), + 1173: (82.0, 224.0, 187.0), + 1174: (92.0, 110.0, 214.0), + 1175: (168.0, 80.0, 171.0), + 1176: (197.0, 63.0, 51.0), + 1178: (175.0, 199.0, 77.0), + 1179: (62.0, 180.0, 98.0), + 1180: (8.0, 91.0, 150.0), + 1181: (77.0, 15.0, 130.0), + 1182: (154.0, 65.0, 96.0), + 1183: (197.0, 152.0, 11.0), + 1184: (59.0, 155.0, 45.0), + 1185: (12.0, 147.0, 145.0), + 1186: (54.0, 35.0, 219.0), + 1187: (210.0, 73.0, 181.0), + 1188: (221.0, 124.0, 77.0), + 1189: (149.0, 214.0, 66.0), + 1190: (72.0, 185.0, 134.0), + 1191: (42.0, 94.0, 198.0), +} + +### For instance segmentation the non-object categories ### +VALID_PANOPTIC_IDS = (1, 3) + +CLASS_LABELS_PANOPTIC = ("wall", "floor") diff --git a/models/Mask3D/mask3d/datasets/scannet200/scannet200_splits.py b/models/Mask3D/mask3d/datasets/scannet200/scannet200_splits.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5585f70319d1eb061669bd82bbf3d64d0bca7b --- /dev/null +++ b/models/Mask3D/mask3d/datasets/scannet200/scannet200_splits.py @@ -0,0 +1,625 @@ +### This file contains the HEAD - 
COMMON - TAIL split category ids for ScanNet 200 + +HEAD_CATS_SCANNET_200 = [ + "tv stand", + "curtain", + "blinds", + "shower curtain", + "bookshelf", + "tv", + "kitchen cabinet", + "pillow", + "lamp", + "dresser", + "monitor", + "object", + "ceiling", + "board", + "stove", + "closet wall", + "couch", + "office chair", + "kitchen counter", + "shower", + "closet", + "doorframe", + "sofa chair", + "mailbox", + "nightstand", + "washing machine", + "picture", + "book", + "sink", + "recycling bin", + "table", + "backpack", + "shower wall", + "toilet", + "copier", + "counter", + "stool", + "refrigerator", + "window", + "file cabinet", + "chair", + "wall", + "plant", + "coffee table", + "stairs", + "armchair", + "cabinet", + "bathroom vanity", + "bathroom stall", + "mirror", + "blackboard", + "trash can", + "stair rail", + "box", + "towel", + "door", + "clothes", + "whiteboard", + "bed", + "floor", + "bathtub", + "desk", + "wardrobe", + "clothes dryer", + "radiator", + "shelf", +] +COMMON_CATS_SCANNET_200 = [ + "cushion", + "end table", + "dining table", + "keyboard", + "bag", + "toilet paper", + "printer", + "blanket", + "microwave", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "basket", + "fan", + "laptop", + "person", + "paper towel dispenser", + "oven", + "rack", + "piano", + "suitcase", + "rail", + "container", + "telephone", + "stand", + "light", + "laundry basket", + "pipe", + "seat", + "column", + "bicycle", + "ladder", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "machine", + "mat", + "windowsill", + "bulletin board", + "fireplace", + "mini fridge", + "water cooler", + "shower door", + "pillar", + "ledge", + "furniture", + "cart", + "decoration", + "closet door", + "vacuum cleaner", + "dish rack", + "range hood", + "projector screen", + "divider", + "bathroom counter", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "bathroom cabinet", + "structure", + "storage organizer", + 
"potted plant", + "mattress", +] +TAIL_CATS_SCANNET_200 = [ + "paper", + "plate", + "soap dispenser", + "bucket", + "clock", + "guitar", + "toilet paper holder", + "speaker", + "cup", + "paper towel roll", + "bar", + "toaster", + "ironing board", + "soap dish", + "toilet paper dispenser", + "fire extinguisher", + "ball", + "hat", + "shower curtain rod", + "paper cutter", + "tray", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "storage container", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "sign", + "projector", + "candle", + "plunger", + "stuffed animal", + "headphones", + "broom", + "guitar case", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "purse", + "vent", + "shower floor", + "water pitcher", + "bowl", + "paper bag", + "alarm clock", + "music stand", + "laundry detergent", + "dumbbell", + "tube", + "cd case", + "closet rod", + "coffee kettle", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "luggage", +] + + +### Given the different size of the official train and val sets, not all ScanNet200 categories are present in the validation set. 
+### Here we list of categories with labels and IDs present in both train and validation set, and the remaining categories those are present in train, but not in val +### We dont evaluate on unseen validation categories in this benchmark + +VALID_CLASS_IDS_200_VALIDATION = ( + "wall", + "chair", + "floor", + "table", + "door", + "couch", + "cabinet", + "shelf", + "desk", + "office chair", + "bed", + "pillow", + "sink", + "picture", + "window", + "toilet", + "bookshelf", + "monitor", + "curtain", + "book", + "armchair", + "coffee table", + "box", + "refrigerator", + "lamp", + "kitchen cabinet", + "towel", + "clothes", + "tv", + "nightstand", + "counter", + "dresser", + "stool", + "cushion", + "plant", + "ceiling", + "bathtub", + "end table", + "dining table", + "keyboard", + "bag", + "backpack", + "toilet paper", + "printer", + "tv stand", + "whiteboard", + "blanket", + "shower curtain", + "trash can", + "closet", + "stairs", + "microwave", + "stove", + "shoe", + "computer tower", + "bottle", + "bin", + "ottoman", + "bench", + "board", + "washing machine", + "mirror", + "copier", + "basket", + "sofa chair", + "file cabinet", + "fan", + "laptop", + "shower", + "paper", + "person", + "paper towel dispenser", + "oven", + "blinds", + "rack", + "plate", + "blackboard", + "piano", + "suitcase", + "rail", + "radiator", + "recycling bin", + "container", + "wardrobe", + "soap dispenser", + "telephone", + "bucket", + "clock", + "stand", + "light", + "laundry basket", + "pipe", + "clothes dryer", + "guitar", + "toilet paper holder", + "seat", + "speaker", + "column", + "ladder", + "bathroom stall", + "shower wall", + "cup", + "jacket", + "storage bin", + "coffee maker", + "dishwasher", + "paper towel roll", + "machine", + "mat", + "windowsill", + "bar", + "toaster", + "bulletin board", + "ironing board", + "fireplace", + "soap dish", + "kitchen counter", + "doorframe", + "toilet paper dispenser", + "mini fridge", + "fire extinguisher", + "ball", + "hat", + "shower curtain 
rod", + "water cooler", + "paper cutter", + "tray", + "shower door", + "pillar", + "ledge", + "toaster oven", + "mouse", + "toilet seat cover dispenser", + "furniture", + "cart", + "scale", + "tissue box", + "light switch", + "crate", + "power outlet", + "decoration", + "sign", + "projector", + "closet door", + "vacuum cleaner", + "plunger", + "stuffed animal", + "headphones", + "dish rack", + "broom", + "range hood", + "dustpan", + "hair dryer", + "water bottle", + "handicap bar", + "vent", + "shower floor", + "water pitcher", + "mailbox", + "bowl", + "paper bag", + "projector screen", + "divider", + "laundry detergent", + "bathroom counter", + "object", + "bathroom vanity", + "closet wall", + "laundry hamper", + "bathroom stall door", + "ceiling light", + "trash bin", + "dumbbell", + "stair rail", + "tube", + "bathroom cabinet", + "closet rod", + "coffee kettle", + "shower head", + "keyboard piano", + "case of water bottles", + "coat rack", + "folded chair", + "fire alarm", + "power strip", + "calendar", + "poster", + "potted plant", + "mattress", +) + +CLASS_LABELS_200_VALIDATION = ( + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 21, + 22, + 23, + 24, + 26, + 27, + 28, + 29, + 31, + 32, + 33, + 34, + 35, + 36, + 38, + 39, + 40, + 41, + 42, + 44, + 45, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 54, + 55, + 56, + 57, + 58, + 59, + 62, + 63, + 64, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 82, + 84, + 86, + 87, + 88, + 89, + 90, + 93, + 95, + 96, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 106, + 107, + 110, + 112, + 115, + 116, + 118, + 120, + 122, + 125, + 128, + 130, + 131, + 132, + 134, + 136, + 138, + 139, + 140, + 141, + 145, + 148, + 154, + 155, + 156, + 157, + 159, + 161, + 163, + 165, + 166, + 168, + 169, + 170, + 177, + 180, + 185, + 188, + 191, + 193, + 195, + 202, + 208, + 213, + 214, + 229, + 230, + 232, + 233, + 242, + 250, + 
261, + 264, + 276, + 283, + 300, + 304, + 312, + 323, + 325, + 342, + 356, + 370, + 392, + 395, + 408, + 417, + 488, + 540, + 562, + 570, + 609, + 748, + 776, + 1156, + 1163, + 1164, + 1165, + 1166, + 1167, + 1168, + 1169, + 1170, + 1171, + 1172, + 1173, + 1175, + 1176, + 1179, + 1180, + 1181, + 1182, + 1184, + 1185, + 1186, + 1187, + 1188, + 1189, + 1191, +) + +VALID_CLASS_IDS_200_TRAIN_ONLY = ( + "bicycle", + "storage container", + "candle", + "guitar case", + "purse", + "alarm clock", + "music stand", + "cd case", + "structure", + "storage organizer", + "luggage", +) + +CLASS_LABELS_200_TRAIN_ONLY = ( + 121, + 221, + 286, + 331, + 399, + 572, + 581, + 1174, + 1178, + 1183, + 1190, +) diff --git a/models/Mask3D/mask3d/datasets/semseg.py b/models/Mask3D/mask3d/datasets/semseg.py new file mode 100644 index 0000000000000000000000000000000000000000..a848b1a20e4690971bf16790fcea00ade84441c0 --- /dev/null +++ b/models/Mask3D/mask3d/datasets/semseg.py @@ -0,0 +1,993 @@ +import logging +from itertools import product +from pathlib import Path +from random import random, sample, uniform +from typing import List, Optional, Tuple, Union +from random import choice +from copy import deepcopy +from random import randrange + + +import numpy +import torch +from datasets.random_cuboid import RandomCuboid + +import albumentations as A +import numpy as np +import scipy +import volumentations as V +import yaml + +# from yaml import CLoader as Loader +from torch.utils.data import Dataset +from datasets.scannet200.scannet200_constants import ( + SCANNET_COLOR_MAP_200, + SCANNET_COLOR_MAP_20, +) + +logger = logging.getLogger(__name__) + + +class SemanticSegmentationDataset(Dataset): + """Docstring for SemanticSegmentationDataset.""" + + def __init__( + self, + dataset_name="scannet", + data_dir: Optional[Union[str, Tuple[str]]] = "data/processed/scannet", + label_db_filepath: Optional[ + str + ] = "configs/scannet_preprocessing/label_database.yaml", + # mean std values from scannet + 
color_mean_std: Optional[Union[str, Tuple[Tuple[float]]]] = ( + (0.47793125906962, 0.4303257521323044, 0.3749598901421883), + (0.2834475483823543, 0.27566157565723015, 0.27018971370874995), + ), + mode: Optional[str] = "train", + add_colors: Optional[bool] = True, + add_normals: Optional[bool] = True, + add_raw_coordinates: Optional[bool] = False, + add_instance: Optional[bool] = False, + num_labels: Optional[int] = -1, + data_percent: Optional[float] = 1.0, + ignore_label: Optional[Union[int, Tuple[int]]] = 255, + volume_augmentations_path: Optional[str] = None, + image_augmentations_path: Optional[str] = None, + instance_oversampling=0, + place_around_existing=False, + max_cut_region=0, + point_per_cut=100, + flip_in_center=False, + noise_rate=0.0, + resample_points=0.0, + cache_data=False, + add_unlabeled_pc=False, + task="instance_segmentation", + cropping=False, + cropping_args=None, + is_tta=False, + crop_min_size=20000, + crop_length=6.0, + cropping_v1=True, + reps_per_epoch=1, + area=-1, + on_crops=False, + eval_inner_core=-1, + filter_out_classes=[], + label_offset=0, + add_clip=False, + is_elastic_distortion=True, + color_drop=0.0, + ): + assert task in [ + "instance_segmentation", + "semantic_segmentation", + ], "unknown task" + + self.add_clip = add_clip + self.dataset_name = dataset_name + self.is_elastic_distortion = is_elastic_distortion + self.color_drop = color_drop + + if self.dataset_name == "scannet": + self.color_map = SCANNET_COLOR_MAP_20 + self.color_map[255] = (255, 255, 255) + elif self.dataset_name == "stpls3d": + self.color_map = { + 0: [0, 255, 0], # Ground + 1: [0, 0, 255], # Build + 2: [0, 255, 255], # LowVeg + 3: [255, 255, 0], # MediumVeg + 4: [255, 0, 255], # HiVeg + 5: [100, 100, 255], # Vehicle + 6: [200, 200, 100], # Truck + 7: [170, 120, 200], # Aircraft + 8: [255, 0, 0], # MilitaryVec + 9: [200, 100, 100], # Bike + 10: [10, 200, 100], # Motorcycle + 11: [200, 200, 200], # LightPole + 12: [50, 50, 50], # StreetSign + 13: [60, 
130, 60], # Clutter + 14: [130, 30, 60], + } # Fence + elif self.dataset_name == "scannet200": + self.color_map = SCANNET_COLOR_MAP_200 + elif self.dataset_name == "s3dis": + self.color_map = { + 0: [0, 255, 0], # ceiling + 1: [0, 0, 255], # floor + 2: [0, 255, 255], # wall + 3: [255, 255, 0], # beam + 4: [255, 0, 255], # column + 5: [100, 100, 255], # window + 6: [200, 200, 100], # door + 7: [170, 120, 200], # table + 8: [255, 0, 0], # chair + 9: [200, 100, 100], # sofa + 10: [10, 200, 100], # bookcase + 11: [200, 200, 200], # board + 12: [50, 50, 50], # clutter + } + else: + assert False, "dataset not known" + + self.task = task + + self.filter_out_classes = filter_out_classes + self.label_offset = label_offset + + self.area = area + self.eval_inner_core = eval_inner_core + + self.reps_per_epoch = reps_per_epoch + + self.cropping = cropping + self.cropping_args = cropping_args + self.is_tta = is_tta + self.on_crops = on_crops + + self.crop_min_size = crop_min_size + self.crop_length = crop_length + + self.version1 = cropping_v1 + + self.random_cuboid = RandomCuboid( + self.crop_min_size, + crop_length=self.crop_length, + version1=self.version1, + ) + + self.mode = mode + self.data_dir = data_dir + self.add_unlabeled_pc = add_unlabeled_pc + if add_unlabeled_pc: + self.other_database = self._load_yaml( + Path(data_dir).parent / "matterport" / "train_database.yaml" + ) + if type(data_dir) == str: + self.data_dir = [self.data_dir] + self.ignore_label = ignore_label + self.add_colors = add_colors + self.add_normals = add_normals + self.add_instance = add_instance + self.add_raw_coordinates = add_raw_coordinates + self.instance_oversampling = instance_oversampling + self.place_around_existing = place_around_existing + self.max_cut_region = max_cut_region + self.point_per_cut = point_per_cut + self.flip_in_center = flip_in_center + self.noise_rate = noise_rate + self.resample_points = resample_points + + # loading database files + self._data = [] + for database_path in 
self.data_dir: + database_path = Path(database_path) + mode = 'Validation' + if self.dataset_name != "s3dis": + if not (database_path / f"{mode}_database.yaml").exists(): + print( + f"generate {database_path}/{mode}_database.yaml first" + ) + exit() + self._data.extend( + self._load_yaml(database_path / f"{mode}_database.yaml") + ) + else: + # mode_s3dis = f"Area_{self.area}" + mode_s3dis = "Validation" + if self.mode == "train": + mode_s3dis = "train_" + mode_s3dis + if not ( + database_path / f"{mode_s3dis}_database.yaml" + ).exists(): + print( + f"generate {database_path}/{mode_s3dis}_database.yaml first" + ) + exit() + self._data.extend( + self._load_yaml( + database_path / f"{mode_s3dis}_database.yaml" + ) + ) + if data_percent < 1.0: + self._data = sample( + self._data, int(len(self._data) * data_percent) + ) + # labels = self._load_yaml(Path(label_db_filepath)) + + # if working only on classes for validation - discard others + # self._labels = self._select_correct_labels(labels, num_labels) + + if instance_oversampling > 0: + self.instance_data = self._load_yaml( + Path(label_db_filepath).parent / "instance_database.yaml" + ) + + # normalize color channels + if self.dataset_name == "s3dis": + color_mean_std = color_mean_std.replace( + "color_mean_std.yaml", f"Area_{self.area}_color_mean_std.yaml" + ) + + if Path(str(color_mean_std)).exists(): + color_mean_std = self._load_yaml(color_mean_std) + color_mean, color_std = ( + tuple(color_mean_std["mean"]), + tuple(color_mean_std["std"]), + ) + elif len(color_mean_std[0]) == 3 and len(color_mean_std[1]) == 3: + color_mean, color_std = color_mean_std[0], color_mean_std[1] + else: + logger.error( + "pass mean and std as tuple of tuples, or as an .yaml file" + ) + + # augmentations + self.volume_augmentations = V.NoOp() + if (volume_augmentations_path is not None) and ( + volume_augmentations_path != "none" + ): + self.volume_augmentations = V.load( + Path(volume_augmentations_path), data_format="yaml" + ) + 
self.image_augmentations = A.NoOp() + if (image_augmentations_path is not None) and ( + image_augmentations_path != "none" + ): + self.image_augmentations = A.load( + Path(image_augmentations_path), data_format="yaml" + ) + # mandatory color augmentation + if add_colors: + self.normalize_color = A.Normalize(mean=color_mean, std=color_std) + + self.cache_data = cache_data + # new_data = [] + if self.cache_data: + new_data = [] + for i in range(len(self._data)): + self._data[i]["data"] = np.load( + self.data[i]["filepath"].replace("../../", "") + ) + if self.on_crops: + if self.eval_inner_core == -1: + for block_id, block in enumerate( + self.splitPointCloud(self._data[i]["data"]) + ): + if len(block) > 10000: + new_data.append( + { + "instance_gt_filepath": self._data[i][ + "instance_gt_filepath" + ][block_id] + if len( + self._data[i][ + "instance_gt_filepath" + ] + ) + > 0 + else list(), + "scene": f"{self._data[i]['scene'].replace('.txt', '')}_{block_id}.txt", + "raw_filepath": f"{self.data[i]['filepath'].replace('.npy', '')}_{block_id}", + "data": block, + } + ) + else: + assert False + else: + conds_inner, blocks_outer = self.splitPointCloud( + self._data[i]["data"], + size=self.crop_length, + inner_core=self.eval_inner_core, + ) + + for block_id in range(len(conds_inner)): + cond_inner = conds_inner[block_id] + block_outer = blocks_outer[block_id] + + if cond_inner.sum() > 10000: + new_data.append( + { + "instance_gt_filepath": self._data[i][ + "instance_gt_filepath" + ][block_id] + if len( + self._data[i][ + "instance_gt_filepath" + ] + ) + > 0 + else list(), + "scene": f"{self._data[i]['scene'].replace('.txt', '')}_{block_id}.txt", + "raw_filepath": f"{self.data[i]['filepath'].replace('.npy', '')}_{block_id}", + "data": block_outer, + "cond_inner": cond_inner, + } + ) + else: + assert False + + if self.on_crops: + self._data = new_data + # new_data.append(np.load(self.data[i]["filepath"].replace("../../", ""))) + # self._data = new_data + + def 
splitPointCloud(self, cloud, size=50.0, stride=50, inner_core=-1): + if inner_core == -1: + limitMax = np.amax(cloud[:, 0:3], axis=0) + width = int(np.ceil((limitMax[0] - size) / stride)) + 1 + depth = int(np.ceil((limitMax[1] - size) / stride)) + 1 + cells = [ + (x * stride, y * stride) + for x in range(width) + for y in range(depth) + ] + blocks = [] + for (x, y) in cells: + xcond = (cloud[:, 0] <= x + size) & (cloud[:, 0] >= x) + ycond = (cloud[:, 1] <= y + size) & (cloud[:, 1] >= y) + cond = xcond & ycond + block = cloud[cond, :] + blocks.append(block) + return blocks + else: + limitMax = np.amax(cloud[:, 0:3], axis=0) + width = int(np.ceil((limitMax[0] - inner_core) / stride)) + 1 + depth = int(np.ceil((limitMax[1] - inner_core) / stride)) + 1 + cells = [ + (x * stride, y * stride) + for x in range(width) + for y in range(depth) + ] + blocks_outer = [] + conds_inner = [] + for (x, y) in cells: + xcond_outer = ( + cloud[:, 0] <= x + inner_core / 2.0 + size / 2 + ) & (cloud[:, 0] >= x + inner_core / 2.0 - size / 2) + ycond_outer = ( + cloud[:, 1] <= y + inner_core / 2.0 + size / 2 + ) & (cloud[:, 1] >= y + inner_core / 2.0 - size / 2) + + cond_outer = xcond_outer & ycond_outer + block_outer = cloud[cond_outer, :] + + xcond_inner = (block_outer[:, 0] <= x + inner_core) & ( + block_outer[:, 0] >= x + ) + ycond_inner = (block_outer[:, 1] <= y + inner_core) & ( + block_outer[:, 1] >= y + ) + + cond_inner = xcond_inner & ycond_inner + + conds_inner.append(cond_inner) + blocks_outer.append(block_outer) + return conds_inner, blocks_outer + + def map2color(self, labels): + output_colors = list() + + for label in labels: + output_colors.append(self.color_map[label]) + + return torch.tensor(output_colors) + + def __len__(self): + if self.is_tta: + return 5 * len(self.data) + else: + return self.reps_per_epoch * len(self.data) + + def __getitem__(self, idx: int): + idx = idx % len(self.data) + if self.is_tta: + idx = idx % len(self.data) + + if self.cache_data: + points = 
self.data[idx]["data"] + else: + assert not self.on_crops, "you need caching if on crops" + points = np.load(self.data[idx]["filepath"].replace("../../", "")) + + if "train" in self.mode and self.dataset_name in ["s3dis", "stpls3d"]: + inds = self.random_cuboid(points) + points = points[inds] + + coordinates, color, normals, segments, labels = ( + points[:, :3], + points[:, 3:6], + points[:, 6:9], + points[:, 9], + points[:, 10:12], + ) + + raw_coordinates = coordinates.copy() + raw_color = color + raw_normals = normals + + if not self.add_colors: + color = np.ones((len(color), 3)) + + # volume and image augmentations for train + if "train" in self.mode or self.is_tta: + if self.cropping: + new_idx = self.random_cuboid( + coordinates, + labels[:, 1], + self._remap_from_zero(labels[:, 0].copy()), + ) + + coordinates = coordinates[new_idx] + color = color[new_idx] + labels = labels[new_idx] + segments = segments[new_idx] + raw_color = raw_color[new_idx] + raw_normals = raw_normals[new_idx] + normals = normals[new_idx] + points = points[new_idx] + + coordinates -= coordinates.mean(0) + + try: + coordinates += ( + np.random.uniform(coordinates.min(0), coordinates.max(0)) + / 2 + ) + except OverflowError as err: + print(coordinates) + print(coordinates.shape) + raise err + + if self.instance_oversampling > 0.0: + ( + coordinates, + color, + normals, + labels, + ) = self.augment_individual_instance( + coordinates, + color, + normals, + labels, + self.instance_oversampling, + ) + + if self.flip_in_center: + coordinates = flip_in_center(coordinates) + + for i in (0, 1): + if random() < 0.5: + coord_max = np.max(points[:, i]) + coordinates[:, i] = coord_max - coordinates[:, i] + + if random() < 0.95: + if self.is_elastic_distortion: + for granularity, magnitude in ((0.2, 0.4), (0.8, 1.6)): + coordinates = elastic_distortion( + coordinates, granularity, magnitude + ) + aug = self.volume_augmentations( + points=coordinates, + normals=normals, + features=color, + 
labels=labels, + ) + coordinates, color, normals, labels = ( + aug["points"], + aug["features"], + aug["normals"], + aug["labels"], + ) + pseudo_image = color.astype(np.uint8)[np.newaxis, :, :] + color = np.squeeze( + self.image_augmentations(image=pseudo_image)["image"] + ) + + if self.point_per_cut != 0: + number_of_cuts = int(len(coordinates) / self.point_per_cut) + for _ in range(number_of_cuts): + size_of_cut = np.random.uniform(0.05, self.max_cut_region) + # not wall, floor or empty + point = choice(coordinates) + x_min = point[0] - size_of_cut + x_max = x_min + size_of_cut + y_min = point[1] - size_of_cut + y_max = y_min + size_of_cut + z_min = point[2] - size_of_cut + z_max = z_min + size_of_cut + indexes = crop( + coordinates, x_min, y_min, z_min, x_max, y_max, z_max + ) + coordinates, normals, color, labels = ( + coordinates[~indexes], + normals[~indexes], + color[~indexes], + labels[~indexes], + ) + + # if self.noise_rate > 0: + # coordinates, color, normals, labels = random_points( + # coordinates, + # color, + # normals, + # labels, + # self.noise_rate, + # self.ignore_label, + # ) + + if (self.resample_points > 0) or (self.noise_rate > 0): + coordinates, color, normals, labels = random_around_points( + coordinates, + color, + normals, + labels, + self.resample_points, + self.noise_rate, + self.ignore_label, + ) + + if self.add_unlabeled_pc: + if random() < 0.8: + new_points = np.load( + self.other_database[ + np.random.randint(0, len(self.other_database) - 1) + ]["filepath"] + ) + ( + unlabeled_coords, + unlabeled_color, + unlabeled_normals, + unlabeled_labels, + ) = ( + new_points[:, :3], + new_points[:, 3:6], + new_points[:, 6:9], + new_points[:, 9:], + ) + unlabeled_coords -= unlabeled_coords.mean(0) + unlabeled_coords += ( + np.random.uniform( + unlabeled_coords.min(0), unlabeled_coords.max(0) + ) + / 2 + ) + + aug = self.volume_augmentations( + points=unlabeled_coords, + normals=unlabeled_normals, + features=unlabeled_color, + 
labels=unlabeled_labels, + ) + ( + unlabeled_coords, + unlabeled_color, + unlabeled_normals, + unlabeled_labels, + ) = ( + aug["points"], + aug["features"], + aug["normals"], + aug["labels"], + ) + pseudo_image = unlabeled_color.astype(np.uint8)[ + np.newaxis, :, : + ] + unlabeled_color = np.squeeze( + self.image_augmentations(image=pseudo_image)["image"] + ) + + coordinates = np.concatenate( + (coordinates, unlabeled_coords) + ) + color = np.concatenate((color, unlabeled_color)) + normals = np.concatenate((normals, unlabeled_normals)) + labels = np.concatenate( + ( + labels, + np.full_like(unlabeled_labels, self.ignore_label), + ) + ) + + if random() < self.color_drop: + color[:] = 255 + + # normalize color information + pseudo_image = color.astype(np.uint8)[np.newaxis, :, :] + color = np.squeeze(self.normalize_color(image=pseudo_image)["image"]) + + # prepare labels and map from 0 to 20(40) + labels = labels.astype(np.int32) + # if labels.size > 0: + # labels[:, 0] = self._remap_from_zero(labels[:, 0]) + # if not self.add_instance: + # # taking only first column, which is segmentation label, not instance + # labels = labels[:, 0].flatten()[..., None] + + labels = np.hstack((labels, segments[..., None].astype(np.int32))) + + features = color + if self.add_normals: + features = np.hstack((features, normals)) + if self.add_raw_coordinates: + if len(features.shape) == 1: + features = np.hstack((features[None, ...], coordinates)) + else: + features = np.hstack((features, coordinates)) + + # if self.task != "semantic_segmentation": + if self.data[idx]["raw_filepath"].split("/")[-2] in [ + "scene0636_00", + "scene0154_00", + ]: + return self.__getitem__(0) + + if self.dataset_name == "s3dis": + return ( + coordinates, + features, + labels, + self.data[idx]["area"] + "_" + self.data[idx]["scene"], + raw_color, + raw_normals, + raw_coordinates, + idx, + ) + if self.dataset_name == "stpls3d": + if labels.shape[1] != 1: # only segments --> test set! 
+ if np.unique(labels[:, -2]).shape[0] < 2: + print("NO INSTANCES") + return self.__getitem__(0) + return ( + coordinates, + features, + labels, + self.data[idx]["scene"], + raw_color, + raw_normals, + raw_coordinates, + idx, + ) + else: + return ( + coordinates, + features, + labels, + self.data[idx]["raw_filepath"].split("/")[-2], + raw_color, + raw_normals, + raw_coordinates, + idx, + ) + + @property + def data(self): + """database file containing information about preproscessed dataset""" + return self._data + + @property + def label_info(self): + """database file containing information labels used by dataset""" + return self._labels + + @staticmethod + def _load_yaml(filepath): + with open(filepath) as f: + # file = yaml.load(f, Loader=Loader) + file = yaml.load(f) + return file + + def _select_correct_labels(self, labels, num_labels): + number_of_validation_labels = 0 + number_of_all_labels = 0 + for ( + k, + v, + ) in labels.items(): + number_of_all_labels += 1 + if v["validation"]: + number_of_validation_labels += 1 + + if num_labels == number_of_all_labels: + return labels + elif num_labels == number_of_validation_labels: + valid_labels = dict() + for ( + k, + v, + ) in labels.items(): + if v["validation"]: + valid_labels.update({k: v}) + return valid_labels + else: + msg = f"""not available number labels, select from: + {number_of_validation_labels}, {number_of_all_labels}""" + raise ValueError(msg) + + def _remap_from_zero(self, labels): + labels[ + ~np.isin(labels, list(self.label_info.keys())) + ] = self.ignore_label + # remap to the range from 0 + for i, k in enumerate(self.label_info.keys()): + labels[labels == k] = i + return labels + + def _remap_model_output(self, output): + output = np.array(output) + output_remapped = output.copy() + for i, k in enumerate(self.label_info.keys()): + output_remapped[output == i] = k + return output_remapped + + def augment_individual_instance( + self, coordinates, color, normals, labels, oversampling=1.0 + ): + 
def elastic_distortion(pointcloud, granularity, magnitude):
    """Apply elastic distortion on sparse coordinate space.

    A random noise grid is generated, smoothed with box filters along the
    three spatial axes and interpolated at every point; the interpolated
    offset, scaled by ``magnitude``, is added to the first three columns.

    pointcloud: numpy array of (number of points, at least 3 spatial dims);
        it is modified in place and also returned
    granularity: size of the noise grid (in same scale[m/cm] as the voxel grid)
    magnitude: noise multiplier

    An empty point cloud is returned unchanged.
    """
    # Imported locally: a bare ``import scipy`` does not expose the
    # submodules on older SciPy releases, and the previously used
    # ``scipy.ndimage.filters`` namespace is deprecated.
    from scipy import interpolate, ndimage

    if len(pointcloud) == 0:
        # min()/max() below are undefined for zero points
        return pointcloud

    # One 3-tap box filter per spatial axis; the last axis (noise channel)
    # is never blurred.
    kernels = (
        np.ones((3, 1, 1, 1)).astype("float32") / 3,
        np.ones((1, 3, 1, 1)).astype("float32") / 3,
        np.ones((1, 1, 3, 1)).astype("float32") / 3,
    )
    coords = pointcloud[:, :3]
    coords_min = coords.min(0)

    # Create Gaussian noise tensor of the size given by granularity.
    noise_dim = ((coords - coords_min).max(0) // granularity).astype(int) + 3
    noise = np.random.randn(*noise_dim, 3).astype(np.float32)

    # Smoothing: two passes over the x, y and z filters.
    for _ in range(2):
        for kernel in kernels:
            noise = ndimage.convolve(noise, kernel, mode="constant", cval=0)

    # Trilinear interpolate noise filters for each spatial dimensions.
    # The grid starts one cell before the minimum coordinate so that every
    # point lies inside it.
    ax = [
        np.linspace(d_min, d_max, d)
        for d_min, d_max, d in zip(
            coords_min - granularity,
            coords_min + granularity * (noise_dim - 2),
            noise_dim,
        )
    ]
    interp = interpolate.RegularGridInterpolator(
        ax, noise, bounds_error=False, fill_value=0
    )
    pointcloud[:, :3] = coords + interp(coords) * magnitude
    return pointcloud
def flip_in_center(coordinates):
    """Centre the points and apply the flip augmentation per XY quadrant.

    The coordinates are shifted in place so that their mean is the origin.
    The four quadrant masks are computed once, before any point is moved,
    and each non-empty quadrant is passed through the composed ``Flip3d``
    augmentation.  For every quadrant except (+x, +y) the quadrant's
    pre-flip minimum, with selected axes zeroed, is added back afterwards.

    Quadrants without points are skipped.  The previous ``mask.size > 1``
    test counted all points rather than the points in the quadrant, so an
    empty quadrant reached ``min`` on a zero-size array and raised.

    coordinates: float array of shape (number of points, 3), modified in place
    Returns the same array.
    """
    # moving coordinates to center
    coordinates -= coordinates.mean(0)
    aug = V.Compose(
        [
            V.Flip3d(axis=(0, 1, 0), always_apply=True),
            V.Flip3d(axis=(1, 0, 0), always_apply=True),
        ]
    )

    x_pos = coordinates[:, 0] > 0
    x_neg = coordinates[:, 0] < 0
    y_pos = coordinates[:, 1] > 0
    y_neg = coordinates[:, 1] < 0

    # (mask, axes of the pre-flip minimum that are zeroed before it is
    # added back); None means no shift is applied for that quadrant.
    quadrants = (
        (x_pos & y_pos, None),  # x y
        (x_pos & y_neg, (0, 2)),  # x -y
        (x_neg & y_pos, (1, 2)),  # -x y
        (x_neg & y_neg, (2,)),  # -x -y
    )

    for mask, zeroed_axes in quadrants:
        if not mask.any():
            continue
        if zeroed_axes is None:
            coordinates[mask] = aug(points=coordinates[mask])["points"]
            continue
        minimum = coordinates[mask].min(0)
        minimum[list(zeroed_axes)] = 0
        coordinates[mask] = aug(points=coordinates[mask])["points"]
        coordinates[mask] += minimum

    return coordinates
def random_points(
    coordinates, color, normals, labels, noise_rate=0.6, ignore_label=255
):
    """Append a jittered regular grid of noise points to a point cloud.

    The grid spans the bounding box of ``coordinates`` enlarged by 0.1 on
    every side.  The same number of steps is used on all three axes: the
    largest extent divided by ``noise_rate``, truncated to an int.  Every
    grid point is jittered uniformly by up to ``noise_rate`` per axis and
    receives a random colour in [0, 255), a random normal in [-1, 1) and
    ``ignore_label`` in every label column.

    The grid is built with ``np.meshgrid`` instead of a Python-level
    ``itertools.product`` list.  Point order is unchanged (x slowest, z
    fastest), and a zero-step grid now yields an empty (0, 3) array, so
    the inputs are returned unchanged instead of failing in ``np.vstack``.

    coordinates, color, normals: arrays of shape (number of points, 3)
    labels: 2-D array with one row per point
    Returns the extended ``(coordinates, color, normals, labels)``.
    """
    max_boundary = coordinates.max(0) + 0.1
    min_boundary = coordinates.min(0) - 0.1

    steps = int((max(max_boundary) - min(min_boundary)) / noise_rate)

    axes = [
        np.linspace(min_boundary[dim], max_boundary[dim], steps)
        for dim in range(3)
    ]
    # indexing="ij" keeps the x-major ordering of the former product() loop
    noisy_coordinates = np.stack(
        np.meshgrid(*axes, indexing="ij"), axis=-1
    ).reshape(-1, 3)
    noisy_coordinates += np.random.uniform(
        -noise_rate, noise_rate, size=noisy_coordinates.shape
    )

    noisy_color = np.random.randint(0, 255, size=noisy_coordinates.shape)
    noisy_normals = np.random.rand(*noisy_coordinates.shape) * 2 - 1
    noisy_labels = np.full(
        (noisy_coordinates.shape[0], labels.shape[1]), ignore_label
    )

    coordinates = np.vstack((coordinates, noisy_coordinates))
    color = np.vstack((color, noisy_color))
    normals = np.vstack((normals, noisy_normals))
    labels = np.vstack((labels, noisy_labels))
    return coordinates, color, normals, labels
small_crops=False, + very_small_crops=False, + batch_instance=False, + make_one_pc_noise=False, + place_nearby=False, + place_far=False, + proba=1, + probing=False, + task="instance_segmentation", + ): + assert task in [ + "instance_segmentation", + "semantic_segmentation", + ], "task not known" + self.task = task + self.mode = mode + self.scenes = scenes + self.small_crops = small_crops + self.very_small_crops = very_small_crops + self.ignore_label = ignore_label + self.voxel_size = voxel_size + self.batch_instance = batch_instance + self.make_one_pc_noise = make_one_pc_noise + self.place_nearby = place_nearby + self.place_far = place_far + self.proba = proba + self.probing = probing + + def __call__(self, batch): + if ( + ("train" in self.mode) + and (not self.make_one_pc_noise) + and (self.proba > random()) + ): + if self.small_crops or self.very_small_crops: + batch = make_crops(batch) + if self.very_small_crops: + batch = make_crops(batch) + if self.batch_instance: + batch = batch_instances(batch) + new_batch = [] + for i in range(0, len(batch), self.scenes): + batch_coordinates = [] + batch_features = [] + batch_labels = [] + + batch_filenames = "" + batch_raw_color = [] + batch_raw_normals = [] + + offset_instance_id = 0 + offset_segment_id = 0 + + for j in range(min(len(batch[i:]), self.scenes)): + batch_coordinates.append(batch[i + j][0]) + batch_features.append(batch[i + j][1]) + + if j == 0: + batch_filenames = batch[i + j][3] + else: + batch_filenames = ( + batch_filenames + f"+{batch[i + j][3]}" + ) + + batch_raw_color.append(batch[i + j][4]) + batch_raw_normals.append(batch[i + j][5]) + + # make instance ids and segment ids unique + # take care that -1 instances stay at -1 + batch_labels.append( + batch[i + j][2] + + [0, offset_instance_id, offset_segment_id] + ) + batch_labels[-1][batch[i + j][2][:, 1] == -1, 1] = -1 + + max_instance_id, max_segment_id = batch[i + j][2].max( + axis=0 + )[1:] + offset_segment_id = offset_segment_id + max_segment_id + 
def batch_instances(batch):
    """Split every sample of a batch into one entry per instance id.

    Each sample is indexed as ``(coordinates, features, labels, ...)``,
    where column 1 of ``labels`` holds the instance id.  For every distinct
    instance id (in sorted order, as produced by ``np.unique``) a tuple of
    the matching coordinates, the matching features and column 0 of the
    matching label rows is emitted.

    Returns a flat list of such tuples across all samples.
    """
    per_instance = []
    for scene in batch:
        coords, feats, labels = scene[0], scene[1], scene[2]
        instance_column = labels[:, 1]
        for instance_id in np.unique(instance_column):
            member = instance_column == instance_id
            per_instance.append(
                (coords[member], feats[member], labels[member][:, 0])
            )
    return per_instance
filter_out_classes, + label_offset, + num_queries, +): + ( + coordinates, + features, + labels, + original_labels, + inverse_maps, + original_colors, + original_normals, + original_coordinates, + idx, + ) = ([], [], [], [], [], [], [], [], []) + voxelization_dict = { + "ignore_label": ignore_label, + # "quantization_size": self.voxel_size, + "return_index": True, + "return_inverse": True, + } + + full_res_coords = [] + + for sample in batch: + idx.append(sample[7]) + original_coordinates.append(sample[6]) + original_labels.append(sample[2]) + full_res_coords.append(sample[0]) + original_colors.append(sample[4]) + original_normals.append(sample[5]) + + coords = np.floor(sample[0] / voxel_size) + voxelization_dict.update( + { + "coordinates": torch.from_numpy(coords).to("cpu").contiguous(), + "features": sample[1], + } + ) + + # maybe this change (_, _, ...) is not necessary and we can directly get out + # the sample coordinates? + _, _, unique_map, inverse_map = ME.utils.sparse_quantize( + **voxelization_dict + ) + inverse_maps.append(inverse_map) + + sample_coordinates = coords[unique_map] + coordinates.append(torch.from_numpy(sample_coordinates).int()) + sample_features = sample[1][unique_map] + features.append(torch.from_numpy(sample_features).float()) + if len(sample[2]) > 0: + sample_labels = sample[2][unique_map] + labels.append(torch.from_numpy(sample_labels).long()) + + # Concatenate all lists + input_dict = {"coords": coordinates, "feats": features} + if len(labels) > 0: + input_dict["labels"] = labels + coordinates, features, labels = ME.utils.sparse_collate(**input_dict) + else: + coordinates, features = ME.utils.sparse_collate(**input_dict) + labels = torch.Tensor([]) + + if probing: + return ( + NoGpu( + coordinates, + features, + original_labels, + inverse_maps, + ), + labels, + ) + + if mode == "test": + for i in range(len(input_dict["labels"])): + _, ret_index, ret_inv = np.unique( + input_dict["labels"][i][:, 0], + return_index=True, + 
return_inverse=True, + ) + input_dict["labels"][i][:, 0] = torch.from_numpy(ret_inv) + # input_dict["segment2label"].append(input_dict["labels"][i][ret_index][:, :-1]) + else: + input_dict["segment2label"] = [] + + if "labels" in input_dict: + for i in range(len(input_dict["labels"])): + # TODO BIGGER CHANGE CHECK!!! + _, ret_index, ret_inv = np.unique( + input_dict["labels"][i][:, -1], + return_index=True, + return_inverse=True, + ) + input_dict["labels"][i][:, -1] = torch.from_numpy(ret_inv) + input_dict["segment2label"].append( + input_dict["labels"][i][ret_index][:, :-1] + ) + + if "labels" in input_dict: + list_labels = input_dict["labels"] + + target = [] + target_full = [] + + if len(list_labels[0].shape) == 1: + for batch_id in range(len(list_labels)): + label_ids = list_labels[batch_id].unique() + if 255 in label_ids: + label_ids = label_ids[:-1] + + target.append( + { + "labels": label_ids, + "masks": list_labels[batch_id] + == label_ids.unsqueeze(1), + } + ) + else: + if mode == "test": + for i in range(len(input_dict["labels"])): + target.append( + {"point2segment": input_dict["labels"][i][:, 0]} + ) + target_full.append( + { + "point2segment": torch.from_numpy( + original_labels[i][:, 0] + ).long() + } + ) + else: + target = get_instance_masks( + list_labels, + list_segments=input_dict["segment2label"], + task=task, + ignore_class_threshold=ignore_class_threshold, + filter_out_classes=filter_out_classes, + label_offset=label_offset, + ) + for i in range(len(target)): + target[i]["point2segment"] = input_dict["labels"][i][:, 2] + if "train" not in mode: + target_full = get_instance_masks( + [torch.from_numpy(l) for l in original_labels], + task=task, + ignore_class_threshold=ignore_class_threshold, + filter_out_classes=filter_out_classes, + label_offset=label_offset, + ) + for i in range(len(target_full)): + target_full[i]["point2segment"] = torch.from_numpy( + original_labels[i][:, 2] + ).long() + else: + target = [] + target_full = [] + coordinates = 
def get_instance_masks(
    list_labels,
    task,
    list_segments=None,
    ignore_class_threshold=100,
    filter_out_classes=(),
    label_offset=0,
):
    """Build per-scene target dictionaries from per-point label tensors.

    Every entry of ``list_labels`` is indexed as a 2-D tensor: column 0 is
    read as the class id, column 1 as the instance id and column 2 as the
    segment id of each point (column 2 is only read when ``list_segments``
    is truthy).

    Args:
        list_labels: Sequence of label tensors, one per scene.
        task: ``"semantic_segmentation"`` merges all instance masks that
            share a class id into one mask; any other value keeps one mask
            per instance.
        list_segments: Optional per-scene sequence. Only ``shape[0]`` of each
            entry is used, as the length of the segment mask of that scene.
        ignore_class_threshold: Point-count threshold used by the class-255
            check (see the review note in the body).
        filter_out_classes: Class ids whose instances are dropped. The
            default is an immutable empty tuple.
        label_offset: Subtracted from the class ids (result clamped at 0)
            for every task other than ``"semantic_segmentation"``.

    Returns:
        A list with one dict per scene holding ``"labels"`` and ``"masks"``
        plus ``"segment_mask"`` when ``list_segments`` is truthy. As soon as
        one scene has no remaining instance, an empty list is returned for
        the whole batch.
    """
    target = []

    for batch_id, scene_labels in enumerate(list_labels):
        instance_column = scene_labels[:, 1]
        label_ids = []
        masks = []
        segment_masks = []

        for instance_id in instance_column.unique():
            # Instance id -1 is skipped and never becomes a target.
            if instance_id == -1:
                continue

            # Computed once and reused for the point subset, the point mask
            # and the segment lookup below.
            instance_mask = instance_column == instance_id
            instance_points = scene_labels[instance_mask]
            # The class of an instance is taken from its first point.
            label_id = instance_points[0, 0]

            if label_id in filter_out_classes:
                continue

            # NOTE(review): this condition requires 255 to be in
            # filter_out_classes, but such instances were already skipped by
            # the check above, so this branch looks unreachable. Confirm the
            # intended handling of small class-255 instances.
            if (
                255 in filter_out_classes
                and label_id.item() == 255
                and instance_points.shape[0] < ignore_class_threshold
            ):
                continue

            label_ids.append(label_id)
            masks.append(instance_mask)

            if list_segments:
                segment_mask = torch.zeros(
                    list_segments[batch_id].shape[0]
                ).bool()
                # Mark every segment that contains a point of the instance.
                segment_mask[instance_points[:, 2].unique()] = True
                segment_masks.append(segment_mask)

        if not label_ids:
            # The whole batch is reported as empty, not just this scene.
            return []

        label_ids = torch.stack(label_ids)
        masks = torch.stack(masks)
        if list_segments:
            segment_masks = torch.stack(segment_masks)

        if task == "semantic_segmentation":
            # Union all instance masks that share a class id.
            class_ids = label_ids.unique()
            merged_masks = []
            merged_segment_masks = []
            for class_id in class_ids:
                same_class = label_ids == class_id
                merged_masks.append(masks[same_class, :].sum(dim=0).bool())
                if list_segments:
                    merged_segment_masks.append(
                        segment_masks[same_class, :].sum(dim=0).bool()
                    )

            out_labels = class_ids
            masks = torch.stack(merged_masks)
            if list_segments:
                segment_masks = torch.stack(merged_segment_masks)
        else:
            out_labels = torch.clamp(label_ids - label_offset, min=0)

        entry = {"labels": out_labels, "masks": masks}
        if list_segments:
            entry["segment_mask"] = segment_masks
        target.append(entry)

    return target
class NoGpu:
    """Plain attribute container for the pieces of one collated batch.

    The constructor stores every argument unchanged under an attribute of
    the same name. The original note on this class describes it as a helper
    to prevent GPU loading on Lightning.
    """

    def __init__(
        self,
        coordinates,
        features,
        original_labels=None,
        inverse_maps=None,
        full_res_coords=None,
        target_full=None,
        original_colors=None,
        original_normals=None,
        original_coordinates=None,
        idx=None,
    ):
        """Store the given values as attributes without copying them."""
        # Required payload.
        self.coordinates, self.features = coordinates, features
        # Optional companions of the batch; each defaults to None.
        self.original_labels, self.inverse_maps = original_labels, inverse_maps
        self.full_res_coords, self.target_full = full_res_coords, target_full
        self.original_colors, self.original_normals = (
            original_colors,
            original_normals,
        )
        self.original_coordinates, self.idx = original_coordinates, idx
def get_parameters(cfg: DictConfig):
    """Prepare the run configuration and build the model.

    Loads ``.env``, seeds via ``seed_everything``, fills in the GPU setting
    from ``CUDA_VISIBLE_DEVICES`` when none is configured, prepares the save
    directory and instantiates ``InstanceSegmentation``, optionally loading
    backbone and full checkpoints.

    Returns:
        A ``(cfg, model, loggers)`` tuple; ``loggers`` is always an empty
        list because logger instantiation is currently disabled.
    """
    load_dotenv(".env")
    seed_everything(cfg.general.seed)

    # Fall back to the environment when the config selects no GPUs.
    if cfg.general.get("gpus") is None:
        cfg.general.gpus = os.environ.get("CUDA_VISIBLE_DEVICES")

    if os.path.exists(cfg.general.save_dir):
        # An existing directory is treated as a run to resume.
        print("EXPERIMENT ALREADY EXIST")
        cfg["trainer"][
            "resume_from_checkpoint"
        ] = f"{cfg.general.save_dir}/last-epoch.ckpt"
    else:
        os.makedirs(cfg.general.save_dir)

    # Logger configs are only echoed; none of them is instantiated.
    for log_cfg in cfg.logging:
        print(log_cfg)

    model = InstanceSegmentation(cfg)
    general = cfg.general
    if general.backbone_checkpoint is not None:
        cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys(
            cfg, model
        )
    if cfg.general.checkpoint is not None:
        cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model)

    resolved_cfg = OmegaConf.to_container(cfg, resolve=True)
    logging.getLogger(__name__).info(flatten_dict(resolved_cfg))
    return cfg, model, []
@hydra.main(
    config_path="conf", config_name="config_base_instance_segmentation.yaml"
)
def main(cfg: DictConfig):
    """Run training when ``cfg["general"]["train_mode"]`` is truthy, else testing."""
    entry_point = train if cfg["general"]["train_mode"] else test
    entry_point(cfg)
def dice_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
):
    """Return the summed DICE loss of mask logits, divided by ``num_masks``.

    Args:
        inputs: Mask logits. A sigmoid is applied and everything from
            dimension 1 onwards is flattened.
        targets: Binary targets (0 negative, 1 positive). They are used
            as given, so their shape has to match the flattened inputs.
        num_masks: Normalisation constant applied to the summed loss.

    Returns:
        A scalar tensor: ``sum(1 - (2 * overlap + 1) / (total + 1)) / num_masks``
        with ``overlap`` and ``total`` taken along the last dimension.
    """
    probs = inputs.sigmoid().flatten(1)
    overlap = (probs * targets).sum(-1)
    total = probs.sum(-1) + targets.sum(-1)
    # The +1 terms keep the ratio defined when a mask and its target are empty.
    per_mask = 1 - (2 * overlap + 1) / (total + 1)
    return per_mask.sum() / num_masks
class SetCriterion(nn.Module):
    """Criterion combining a classification loss with mask losses.

    ``forward`` asks ``matcher`` for an assignment between predictions and
    targets and then evaluates every loss named in ``losses`` on the matched
    pairs. The same is repeated for each entry of ``outputs["aux_outputs"]``
    when that key is present.
    """

    def __init__(
        self,
        num_classes,
        matcher,
        weight_dict,
        eos_coef,
        losses,
        num_points,
        oversample_ratio,
        importance_sample_ratio,
        class_weights,
    ):
        """Create the criterion.

        Parameters:
            num_classes: number of classes; stored internally as
                ``num_classes - 1``, which is also the index used for
                unmatched predictions in ``loss_labels``.
            matcher: callable invoked as ``matcher(outputs, targets, mask_type)``
                that returns one ``(prediction_idx, target_idx)`` pair per
                batch element.
            weight_dict: stored and shown in ``__repr__``; not otherwise used
                by this class.
            eos_coef: classification weight of the last (no-object) class.
            losses: names of the losses to compute, ``"labels"`` and/or
                ``"masks"`` (see ``get_loss``).
            num_points: ``-1`` to use every point in ``loss_masks``;
                otherwise the fraction of points that is randomly sampled.
            oversample_ratio: stored and shown in ``__repr__``.
            importance_sample_ratio: stored and shown in ``__repr__``.
            class_weights: ``-1`` for uniform class weights, otherwise a
                sequence with one weight per class (excluding no-object).
        """
        super().__init__()
        self.num_classes = num_classes - 1
        self.class_weights = class_weights
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses

        # Per-class weights for cross entropy; the last slot is no-object.
        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = self.eos_coef

        if self.class_weights != -1:
            assert (
                len(self.class_weights) == self.num_classes
            ), "CLASS WEIGHTS DO NOT MATCH"
            empty_weight[:-1] = torch.tensor(self.class_weights)

        self.register_buffer("empty_weight", empty_weight)

        # pointwise mask loss parameters
        self.num_points = num_points
        self.oversample_ratio = oversample_ratio
        self.importance_sample_ratio = importance_sample_ratio

    def loss_labels(self, outputs, targets, indices, num_masks, mask_type):
        """Weighted cross-entropy classification loss.

        Every prediction starts with the no-object class
        (``self.num_classes``); matched predictions receive the label of
        their target. ``num_masks`` and ``mask_type`` are accepted for a
        uniform loss signature and are not used here.

        Returns:
            ``{"loss_ce": tensor}``.
        """
        assert "pred_logits" in outputs
        src_logits = outputs["pred_logits"].float()

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat(
            [t["labels"][J] for t, (_, J) in zip(targets, indices)]
        )
        target_classes = torch.full(
            src_logits.shape[:2],
            self.num_classes,
            dtype=torch.int64,
            device=src_logits.device,
        )
        target_classes[idx] = target_classes_o

        # Targets equal to 253 do not contribute to the loss.
        # NOTE(review): presumably a dataset-specific ignore label; confirm.
        loss_ce = F.cross_entropy(
            src_logits.transpose(1, 2),
            target_classes,
            self.empty_weight,
            ignore_index=253,
        )
        return {"loss_ce": loss_ce}

    def loss_masks(self, outputs, targets, indices, num_masks, mask_type):
        """Binary cross-entropy and DICE losses on the matched masks.

        For every batch element the matched predicted masks are compared
        with ``targets[batch_id][mask_type]`` on either all points or a
        random subset (see ``num_points``). Each element is normalised by
        its own number of matched targets; the ``num_masks`` argument is
        accepted for a uniform loss signature but not used.

        Returns:
            ``{"loss_mask": tensor, "loss_dice": tensor}``, each summed over
            the batch elements.
        """
        assert "pred_masks" in outputs

        loss_masks = []
        loss_dices = []

        for batch_id, (map_id, target_id) in enumerate(indices):
            # Select the matched columns and transpose so rows are masks.
            # NOTE(review): assumes pred_masks[batch_id] is
            # (points, queries); confirm against the model output.
            pred_mask = outputs["pred_masks"][batch_id][:, map_id].T
            target_mask = targets[batch_id][mask_type][target_id]

            num_target_points = target_mask.shape[1]
            if self.num_points != -1:
                # num_points acts as the fraction of points to keep.
                point_idx = torch.randperm(
                    num_target_points, device=target_mask.device
                )[: int(self.num_points * num_target_points)]
            else:
                # sample all points
                point_idx = torch.arange(
                    num_target_points, device=target_mask.device
                )

            # Normalise by the matched targets of this batch element only.
            num_scene_masks = target_mask.shape[0]
            pred_mask = pred_mask[:, point_idx]
            target_mask = target_mask[:, point_idx].float()

            loss_masks.append(
                sigmoid_ce_loss_jit(pred_mask, target_mask, num_scene_masks)
            )
            loss_dices.append(
                dice_loss_jit(pred_mask, target_mask, num_scene_masks)
            )

        return {
            "loss_mask": torch.sum(torch.stack(loss_masks)),
            "loss_dice": torch.sum(torch.stack(loss_dices)),
        }

    def _get_src_permutation_idx(self, indices):
        """Return ``(batch_idx, src_idx)`` index tensors of matched predictions."""
        batch_idx = torch.cat(
            [torch.full_like(src, i) for i, (src, _) in enumerate(indices)]
        )
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        """Return ``(batch_idx, tgt_idx)`` index tensors of matched targets."""
        batch_idx = torch.cat(
            [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]
        )
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_masks, mask_type):
        """Dispatch to the loss named ``loss`` (``"labels"`` or ``"masks"``)."""
        loss_map = {"labels": self.loss_labels, "masks": self.loss_masks}
        assert loss in loss_map, f"do you really want to compute {loss} loss?"
        return loss_map[loss](outputs, targets, indices, num_masks, mask_type)

    def forward(self, outputs, targets, mask_type):
        """Compute all requested losses.

        Parameters:
            outputs: dict of model outputs; ``"aux_outputs"``, when present,
                is a list of dicts that are scored the same way.
            targets: list of dicts, one per batch element. Each needs
                ``"labels"`` and the key named by ``mask_type``.
            mask_type: key of the target masks used by the matcher and by
                ``loss_masks``.

        Returns:
            Dict of loss tensors. Losses of auxiliary output ``i`` carry the
            suffix ``_{i}``.
        """
        outputs_without_aux = {
            k: v for k, v in outputs.items() if k != "aux_outputs"
        }

        # Match the final-layer outputs with the targets.
        indices = self.matcher(outputs_without_aux, targets, mask_type)

        # Average number of targets across all processes, used for
        # normalisation and clamped to at least 1.
        num_masks = sum(len(t["labels"]) for t in targets)
        num_masks = torch.as_tensor(
            [num_masks],
            dtype=torch.float,
            device=next(iter(outputs.values())).device,
        )
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_masks)
        num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()

        losses = {}
        for loss in self.losses:
            losses.update(
                self.get_loss(
                    loss, outputs, targets, indices, num_masks, mask_type
                )
            )

        # Auxiliary outputs get their own matching and suffixed loss names.
        if "aux_outputs" in outputs:
            for i, aux_outputs in enumerate(outputs["aux_outputs"]):
                indices = self.matcher(aux_outputs, targets, mask_type)
                for loss in self.losses:
                    l_dict = self.get_loss(
                        loss,
                        aux_outputs,
                        targets,
                        indices,
                        num_masks,
                        mask_type,
                    )
                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
                    losses.update(l_dict)

        return losses

    def __repr__(self):
        """Return a multi-line summary of the criterion settings."""
        head = "Criterion " + self.__class__.__name__
        body = [
            "matcher: {}".format(self.matcher.__repr__(_repr_indent=8)),
            "losses: {}".format(self.losses),
            "weight_dict: {}".format(self.weight_dict),
            "num_classes: {}".format(self.num_classes),
            "eos_coef: {}".format(self.eos_coef),
            "num_points: {}".format(self.num_points),
            "oversample_ratio: {}".format(self.oversample_ratio),
            "importance_sample_ratio: {}".format(self.importance_sample_ratio),
        ]
        _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)
num_decoders, + dropout, + pre_norm, + positional_encoding_type, + non_parametric_queries, + train_on_segments, + normalize_pos_enc, + use_level_embed, + scatter_type, + hlevels, + use_np_features, + voxel_size, + max_sample_size, + random_queries, + gauss_scale, + random_query_both, + random_normal, + ): + super().__init__() + self.random_normal = random_normal + self.random_query_both = random_query_both + self.random_queries = random_queries + self.max_sample_size = max_sample_size + self.gauss_scale = gauss_scale + self.voxel_size = voxel_size + self.scatter_type = scatter_type + self.hlevels = hlevels + self.use_level_embed = use_level_embed + self.train_on_segments = train_on_segments + self.normalize_pos_enc = normalize_pos_enc + self.num_decoders = num_decoders + self.num_classes = num_classes + self.dropout = dropout + self.pre_norm = pre_norm + self.shared_decoder = shared_decoder + self.sample_sizes = sample_sizes + self.non_parametric_queries = non_parametric_queries + self.use_np_features = use_np_features + self.mask_dim = hidden_dim + self.num_heads = num_heads + self.num_queries = num_queries + self.pos_enc_type = positional_encoding_type + + self.backbone = hydra.utils.instantiate(config.backbone) + self.num_levels = len(self.hlevels) + sizes = self.backbone.PLANES[-5:] + + self.mask_features_head = conv( + self.backbone.PLANES[7], + self.mask_dim, + kernel_size=1, + stride=1, + bias=True, + D=3, + ) + + if self.scatter_type == "mean": + self.scatter_fn = scatter_mean + elif self.scatter_type == "max": + self.scatter_fn = lambda mask, p2s, dim: scatter_max( + mask, p2s, dim=dim + )[0] + else: + assert False, "Scatter function not known" + + assert ( + not use_np_features + ) or non_parametric_queries, "np features only with np queries" + + if self.non_parametric_queries: + self.query_projection = GenericMLP( + input_dim=self.mask_dim, + hidden_dims=[self.mask_dim], + output_dim=self.mask_dim, + use_conv=True, + output_use_activation=True, + 
hidden_use_bias=True, + ) + + if self.use_np_features: + self.np_feature_projection = nn.Sequential( + nn.Linear(sizes[-1], hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + ) + elif self.random_query_both: + self.query_projection = GenericMLP( + input_dim=2 * self.mask_dim, + hidden_dims=[2 * self.mask_dim], + output_dim=2 * self.mask_dim, + use_conv=True, + output_use_activation=True, + hidden_use_bias=True, + ) + else: + # PARAMETRIC QUERIES + # learnable query features + self.query_feat = nn.Embedding(num_queries, hidden_dim) + # learnable query p.e. + self.query_pos = nn.Embedding(num_queries, hidden_dim) + + if self.use_level_embed: + # learnable scale-level embedding + self.level_embed = nn.Embedding(self.num_levels, hidden_dim) + + self.mask_embed_head = nn.Sequential( + nn.Linear(hidden_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, hidden_dim), + ) + + self.class_embed_head = nn.Linear(hidden_dim, self.num_classes) + + if self.pos_enc_type == "legacy": + self.pos_enc = PositionalEncoding3D(channels=self.mask_dim) + elif self.pos_enc_type == "fourier": + self.pos_enc = PositionEmbeddingCoordsSine( + pos_type="fourier", + d_pos=self.mask_dim, + gauss_scale=self.gauss_scale, + normalize=self.normalize_pos_enc, + ) + elif self.pos_enc_type == "sine": + self.pos_enc = PositionEmbeddingCoordsSine( + pos_type="sine", + d_pos=self.mask_dim, + normalize=self.normalize_pos_enc, + ) + else: + assert False, "pos enc type not known" + + self.pooling = MinkowskiAvgPooling( + kernel_size=2, stride=2, dimension=3 + ) + + self.masked_transformer_decoder = nn.ModuleList() + self.cross_attention = nn.ModuleList() + self.self_attention = nn.ModuleList() + self.ffn_attention = nn.ModuleList() + self.lin_squeeze = nn.ModuleList() + + num_shared = self.num_decoders if not self.shared_decoder else 1 + + for _ in range(num_shared): + tmp_cross_attention = nn.ModuleList() + tmp_self_attention = nn.ModuleList() + tmp_ffn_attention = nn.ModuleList() + 
tmp_squeeze_attention = nn.ModuleList() + for i, hlevel in enumerate(self.hlevels): + tmp_cross_attention.append( + CrossAttentionLayer( + d_model=self.mask_dim, + nhead=self.num_heads, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + tmp_squeeze_attention.append( + nn.Linear(sizes[hlevel], self.mask_dim) + ) + + tmp_self_attention.append( + SelfAttentionLayer( + d_model=self.mask_dim, + nhead=self.num_heads, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + tmp_ffn_attention.append( + FFNLayer( + d_model=self.mask_dim, + dim_feedforward=dim_feedforward, + dropout=self.dropout, + normalize_before=self.pre_norm, + ) + ) + + self.cross_attention.append(tmp_cross_attention) + self.self_attention.append(tmp_self_attention) + self.ffn_attention.append(tmp_ffn_attention) + self.lin_squeeze.append(tmp_squeeze_attention) + + self.decoder_norm = nn.LayerNorm(hidden_dim) + + def get_pos_encs(self, coords): + pos_encodings_pcd = [] + + for i in range(len(coords)): + pos_encodings_pcd.append([[]]) + for coords_batch in coords[i].decomposed_features: + scene_min = coords_batch.min(dim=0)[0][None, ...] + scene_max = coords_batch.max(dim=0)[0][None, ...] 
+ + with autocast(enabled=False): + tmp = self.pos_enc( + coords_batch[None, ...].float(), + input_range=[scene_min, scene_max], + ) + + pos_encodings_pcd[-1][0].append(tmp.squeeze(0).permute((1, 0))) + + return pos_encodings_pcd + + def forward( + self, x, point2segment=None, raw_coordinates=None, is_eval=False + ): + # print(x) + pcd_features, aux = self.backbone(x) + + batch_size = len(x.decomposed_coordinates) + + with torch.no_grad(): + coordinates = me.SparseTensor( + features=raw_coordinates, + coordinate_manager=aux[-1].coordinate_manager, + coordinate_map_key=aux[-1].coordinate_map_key, + device=aux[-1].device, + ) + + coords = [coordinates] + for _ in reversed(range(len(aux) - 1)): + coords.append(self.pooling(coords[-1])) + + coords.reverse() + + pos_encodings_pcd = self.get_pos_encs(coords) + mask_features = self.mask_features_head(pcd_features) + if point2segment is not None: + mask_segments = [] + for i, mask_feature in enumerate( + mask_features.decomposed_features + ): + mask_segments.append( + self.scatter_fn(mask_feature, point2segment[i], dim=0) + ) + + sampled_coords = None + + if self.non_parametric_queries: + fps_idx = [ + furthest_point_sample( + x.decomposed_coordinates[i][None, ...].float(), + self.num_queries, + ) + .squeeze(0) + .long() + for i in range(len(x.decomposed_coordinates)) + ] + + sampled_coords = torch.stack( + [ + coordinates.decomposed_features[i][fps_idx[i].long(), :] + for i in range(len(fps_idx)) + ] + ) + + mins = torch.stack( + [ + coordinates.decomposed_features[i].min(dim=0)[0] + for i in range(len(coordinates.decomposed_features)) + ] + ) + maxs = torch.stack( + [ + coordinates.decomposed_features[i].max(dim=0)[0] + for i in range(len(coordinates.decomposed_features)) + ] + ) + + query_pos = self.pos_enc( + sampled_coords.float(), input_range=[mins, maxs] + ) # Batch, Dim, queries + query_pos = self.query_projection(query_pos) + + if not self.use_np_features: + queries = torch.zeros_like(query_pos).permute((0, 2, 1)) 
+ else: + queries = torch.stack( + [ + pcd_features.decomposed_features[i][ + fps_idx[i].long(), : + ] + for i in range(len(fps_idx)) + ] + ) + queries = self.np_feature_projection(queries) + query_pos = query_pos.permute((2, 0, 1)) + elif self.random_queries: + query_pos = ( + torch.rand( + batch_size, + self.mask_dim, + self.num_queries, + device=x.device, + ) + - 0.5 + ) + + queries = torch.zeros_like(query_pos).permute((0, 2, 1)) + query_pos = query_pos.permute((2, 0, 1)) + elif self.random_query_both: + if not self.random_normal: + query_pos_feat = ( + torch.rand( + batch_size, + 2 * self.mask_dim, + self.num_queries, + device=x.device, + ) + - 0.5 + ) + else: + query_pos_feat = torch.randn( + batch_size, + 2 * self.mask_dim, + self.num_queries, + device=x.device, + ) + + queries = query_pos_feat[:, : self.mask_dim, :].permute((0, 2, 1)) + query_pos = query_pos_feat[:, self.mask_dim :, :].permute( + (2, 0, 1) + ) + else: + # PARAMETRIC QUERIES + queries = self.query_feat.weight.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + query_pos = self.query_pos.weight.unsqueeze(1).repeat( + 1, batch_size, 1 + ) + + predictions_class = [] + predictions_mask = [] + + for decoder_counter in range(self.num_decoders): + if self.shared_decoder: + decoder_counter = 0 + for i, hlevel in enumerate(self.hlevels): + if point2segment is not None: + output_class, outputs_mask, attn_mask = self.mask_module( + queries, + mask_features, + mask_segments, + len(aux) - hlevel - 1, + ret_attn_mask=True, + point2segment=point2segment, + coords=coords, + ) + else: + output_class, outputs_mask, attn_mask = self.mask_module( + queries, + mask_features, + None, + len(aux) - hlevel - 1, + ret_attn_mask=True, + point2segment=None, + coords=coords, + ) + + decomposed_aux = aux[hlevel].decomposed_features + decomposed_attn = attn_mask.decomposed_features + + curr_sample_size = max( + [pcd.shape[0] for pcd in decomposed_aux] + ) + + if min([pcd.shape[0] for pcd in decomposed_aux]) == 1: + raise 
RuntimeError( + "only a single point gives nans in cross-attention" + ) + + if not (self.max_sample_size or is_eval): + curr_sample_size = min( + curr_sample_size, self.sample_sizes[hlevel] + ) + + rand_idx = [] + mask_idx = [] + for k in range(len(decomposed_aux)): + pcd_size = decomposed_aux[k].shape[0] + if pcd_size <= curr_sample_size: + # we do not need to sample + # take all points and pad the rest with zeroes and mask it + idx = torch.zeros( + curr_sample_size, + dtype=torch.long, + device=queries.device, + ) + + midx = torch.ones( + curr_sample_size, + dtype=torch.bool, + device=queries.device, + ) + + idx[:pcd_size] = torch.arange( + pcd_size, device=queries.device + ) + + midx[:pcd_size] = False # attend to first points + else: + # we have more points in pcd as we like to sample + # take a subset (no padding or masking needed) + idx = torch.randperm( + decomposed_aux[k].shape[0], device=queries.device + )[:curr_sample_size] + midx = torch.zeros( + curr_sample_size, + dtype=torch.bool, + device=queries.device, + ) # attend to all + + rand_idx.append(idx) + mask_idx.append(midx) + + batched_aux = torch.stack( + [ + decomposed_aux[k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_attn = torch.stack( + [ + decomposed_attn[k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_pos_enc = torch.stack( + [ + pos_encodings_pcd[hlevel][0][k][rand_idx[k], :] + for k in range(len(rand_idx)) + ] + ) + + batched_attn.permute((0, 2, 1))[ + batched_attn.sum(1) == rand_idx[0].shape[0] + ] = False + + m = torch.stack(mask_idx) + batched_attn = torch.logical_or(batched_attn, m[..., None]) + + src_pcd = self.lin_squeeze[decoder_counter][i]( + batched_aux.permute((1, 0, 2)) + ) + if self.use_level_embed: + src_pcd += self.level_embed.weight[i] + + output = self.cross_attention[decoder_counter][i]( + queries.permute((1, 0, 2)), + src_pcd, + memory_mask=batched_attn.repeat_interleave( + self.num_heads, dim=0 + ).permute((0, 2, 1)), + 
memory_key_padding_mask=None, # here we do not apply masking on padded region + pos=batched_pos_enc.permute((1, 0, 2)), + query_pos=query_pos, + ) + + output = self.self_attention[decoder_counter][i]( + output, + tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_pos, + ) + + # FFN + queries = self.ffn_attention[decoder_counter][i]( + output + ).permute((1, 0, 2)) + + predictions_class.append(output_class) + predictions_mask.append(outputs_mask) + + if point2segment is not None: + output_class, outputs_mask = self.mask_module( + queries, + mask_features, + mask_segments, + 0, + ret_attn_mask=False, + point2segment=point2segment, + coords=coords, + ) + else: + output_class, outputs_mask = self.mask_module( + queries, + mask_features, + None, + 0, + ret_attn_mask=False, + point2segment=None, + coords=coords, + ) + predictions_class.append(output_class) + predictions_mask.append(outputs_mask) + + return { + "pred_logits": predictions_class[-1], + "pred_masks": predictions_mask[-1], + "aux_outputs": self._set_aux_loss( + predictions_class, predictions_mask + ), + "sampled_coords": sampled_coords.detach().cpu().numpy() + if sampled_coords is not None + else None, + "backbone_features": pcd_features, + } + + def mask_module( + self, + query_feat, + mask_features, + mask_segments, + num_pooling_steps, + ret_attn_mask=True, + point2segment=None, + coords=None, + ): + query_feat = self.decoder_norm(query_feat) + mask_embed = self.mask_embed_head(query_feat) + outputs_class = self.class_embed_head(query_feat) + + output_masks = [] + + if point2segment is not None: + output_segments = [] + for i in range(len(mask_segments)): + output_segments.append(mask_segments[i] @ mask_embed[i].T) + output_masks.append(output_segments[-1][point2segment[i]]) + else: + for i in range(mask_features.C[-1, 0] + 1): + output_masks.append( + mask_features.decomposed_features[i] @ mask_embed[i].T + ) + + output_masks = torch.cat(output_masks) + outputs_mask = me.SparseTensor( + 
class PositionalEncoding3D(nn.Module):
    """Sinusoidal positional encoding of batched 3D coordinates.

    Each of the three coordinate axes is encoded with the same bank of
    frequencies as ``[sin(...), cos(...)]``; the three encodings are
    concatenated and truncated to the requested number of channels.
    """

    def __init__(self, channels):
        """
        :param channels: Number of encoding channels to produce per point.
        """
        # Initialise the Module machinery before assigning any attribute.
        super().__init__()
        self.orig_ch = channels
        # Channels available to each axis. ceil(channels / 6) * 2 is always
        # even, so it splits cleanly into a sin half and a cos half.
        channels = int(np.ceil(channels / 6) * 2)
        self.channels = channels
        inv_freq = 1.0 / (
            10000 ** (torch.arange(0, channels, 2).float() / channels)
        )
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, tensor, input_range=None):
        """
        :param tensor: A 3d tensor of size (batch_size, num_points, >=3);
            only the first three entries of the last axis (x, y, z) are read.
        :param input_range: Unused; accepted for interface compatibility.
        :return: Positional encoding of size (batch_size, orig_ch, num_points)
        """
        per_axis = []
        for axis in range(3):
            # Outer product of the coordinates with the frequency bank:
            # (batch, points) x (freqs,) -> (batch, points, freqs)
            angles = torch.einsum(
                "bi,j->bij", tensor[:, :, axis], self.inv_freq
            )
            per_axis.append(torch.cat((angles.sin(), angles.cos()), dim=-1))

        emb = torch.cat(per_axis, dim=-1)
        # 3 * self.channels can exceed the requested width; keep the first
        # orig_ch channels and move the channel axis in front of the points.
        return emb[:, :, : self.orig_ch].permute((0, 2, 1))
self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre( + self, tgt, tgt_mask=None, tgt_key_padding_mask=None, query_pos=None + ): + tgt2 = self.norm(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + )[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward( + self, tgt, tgt_mask=None, tgt_key_padding_mask=None, query_pos=None + ): + if self.normalize_before: + return self.forward_pre( + tgt, tgt_mask, tgt_key_padding_mask, query_pos + ) + return self.forward_post( + tgt, tgt_mask, tgt_key_padding_mask, query_pos + ) + + +class CrossAttentionLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dropout=0.0, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout + ) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + memory_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre( + self, + tgt, + memory, + memory_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + tgt2 = self.norm(tgt) + + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + 
key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward( + self, + tgt, + memory, + memory_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + if self.normalize_before: + return self.forward_pre( + tgt, + memory, + memory_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + return self.forward_post( + tgt, memory, memory_mask, memory_key_padding_mask, pos, query_pos + ) + + +class FFNLayer(nn.Module): + def __init__( + self, + d_model, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + ): + super().__init__() + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt): + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + return tgt + + def forward_pre(self, tgt): + tgt2 = self.norm(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout(tgt2) + return tgt + + def forward(self, tgt): + if self.normalize_before: + return self.forward_pre(tgt) + return self.forward_post(tgt) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation 
should be relu/gelu, not {activation}.") diff --git a/models/Mask3D/mask3d/models/matcher.py b/models/Mask3D/mask3d/models/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0e7a05bb76a078b1c3c3b9c877054e439b584c --- /dev/null +++ b/models/Mask3D/mask3d/models/matcher.py @@ -0,0 +1,226 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py +""" +Modules to compute the matching cost and solve the corresponding LSAP. +""" +import torch +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn +from torch.cuda.amp import autocast + +from detectron2.projects.point_rend.point_features import point_sample + + +def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) + denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +batch_dice_loss_jit = torch.jit.script( + batch_dice_loss +) # type: torch.jit.ScriptModule + + +def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). 
@torch.no_grad()
def memory_efficient_forward(self, outputs, targets, mask_type):
    """Match predictions to targets one batch element at a time.

    For every batch element a cost matrix is assembled from a
    classification term, a sigmoid cross-entropy mask term and a dice mask
    term (weighted by ``self.cost_class``, ``self.cost_mask`` and
    ``self.cost_dice``) and solved with ``linear_sum_assignment``.

    Args:
        outputs: dict holding "pred_logits" (softmaxed over the last axis
            per batch element) and "pred_masks" (indexed per batch element
            and transposed before use).
        targets: sequence of per-batch-element dicts holding "labels" and
            the mask tensor stored under ``mask_type``.
        mask_type: key of the target mask entry to match against.

    Returns:
        A list with one ``(prediction_indices, target_indices)`` tuple of
        int64 tensors per batch element.
    """
    bs, num_queries = outputs["pred_logits"].shape[:2]

    indices = []

    for b in range(bs):
        out_prob = outputs["pred_logits"][b].softmax(-1)
        tgt_ids = targets[b]["labels"].clone()

        # Classification cost. Contrary to the loss, the NLL is not used;
        # it is approximated by 1 - proba[target class]. The constant 1
        # does not change the matching, so it is omitted.
        # Label 253 is treated as "ignore": it is remapped to class 0 only
        # so the gather below is valid, then its cost is overwritten.
        filter_ignore = tgt_ids == 253
        tgt_ids[filter_ignore] = 0
        cost_class = -out_prob[:, tgt_ids]
        # For ignored targets pretend a perfect class match.
        cost_class[:, filter_ignore] = -1.0

        # After the transpose the masks are indexed [row, point]; columns
        # are selected with point_idx below.
        # NOTE(review): rows presumably correspond to queries - confirm.
        out_mask = outputs["pred_masks"][b].T
        # Match dtype/device of the predictions.
        tgt_mask = targets[b][mask_type].to(out_mask)

        num_total = tgt_mask.shape[1]
        if self.num_points != -1:
            # num_points acts as a fraction of the available points. Keep
            # at least one point: with zero sampled points the mask cost
            # becomes 0 / 0 = NaN and the assignment cannot be solved.
            num_sampled = min(
                num_total, max(1, int(self.num_points * num_total))
            )
            point_idx = torch.randperm(num_total, device=tgt_mask.device)[
                :num_sampled
            ]
        else:
            # Use every point.
            point_idx = torch.arange(num_total, device=tgt_mask.device)

        # The mask costs are always evaluated in full precision.
        with autocast(enabled=False):
            out_mask = out_mask.float()
            tgt_mask = tgt_mask.float()
            # Sigmoid cross-entropy cost between masks
            cost_mask = batch_sigmoid_ce_loss_jit(
                out_mask[:, point_idx], tgt_mask[:, point_idx]
            )

            # Dice cost between masks
            cost_dice = batch_dice_loss_jit(
                out_mask[:, point_idx], tgt_mask[:, point_idx]
            )

        # Final cost matrix
        C = (
            self.cost_mask * cost_mask
            + self.cost_class * cost_class
            + self.cost_dice * cost_dice
        )
        C = C.reshape(num_queries, -1).cpu()

        indices.append(linear_sum_assignment(C))

    return [
        (
            torch.as_tensor(i, dtype=torch.int64),
            torch.as_tensor(j, dtype=torch.int64),
        )
        for i, j in indices
    ]
class ConfusionMatrix:
    """Accumulates a confusion matrix for a multi-class classification problem.

    Does not support multi-label, multi-class problems.

    Keyword arguments:
    - num_classes (int): number of classes in the classification problem.
    - ignore_label (int or sequence of ints): target value(s) whose samples
      are dropped before counting.

    Modified from: https://github.com/pytorch/tnt/blob/master/torchnet/meter/confusionmeter.py
    """

    def __init__(self, num_classes, ignore_label):
        super().__init__()

        # 64-bit counters so long accumulations cannot wrap around.
        self.conf = np.zeros((num_classes, num_classes), dtype=np.int64)
        self.ignore_label = ignore_label
        self.num_classes = num_classes
        self.reset()

    def reset(self):
        """Set every counter back to zero."""
        self.conf.fill(0)

    def add(self, predicted, target):
        """Accumulate one batch of predictions into the confusion matrix.

        The shape of the confusion matrix is K x K, where K is the number
        of classes.

        Keyword arguments:
        - predicted (Tensor or numpy.ndarray): Can be an N x K tensor/array of
        predicted scores obtained from the model for N examples and K classes,
        or an N-tensor/array of integer values between 0 and K-1.
        - target (Tensor or numpy.ndarray): Can be an N x K tensor/array of
        one-hot ground-truth classes for N examples and K classes, or an
        N-tensor/array of integer values between 0 and K-1 (plus the
        ignore label).

        Samples whose target equals the ignore label are skipped. A batch
        that is empty after filtering leaves the matrix unchanged.
        """
        # If target and/or predicted are tensors, convert them to numpy arrays
        if torch.is_tensor(predicted):
            predicted = predicted.detach().cpu().numpy()
        if torch.is_tensor(target):
            target = target.detach().cpu().numpy()
        predicted = np.asarray(predicted)
        target = np.asarray(target)

        # Decode one-hot targets to class indices first, so the ignore mask
        # below is always one boolean per sample.
        if np.ndim(target) != 1:
            assert (
                target.shape[1] == self.num_classes
            ), "Onehot target does not match size of confusion matrix"
            assert (target >= 0).all() and (
                target <= 1
            ).all(), "in one-hot encoding, target values should be 0 or 1"
            assert (
                target.sum(1) == 1
            ).all(), "multi-label setting is not supported"
            target = np.argmax(target, 1)

        assert (
            predicted.shape[0] == target.shape[0]
        ), "number of targets and predicted outputs do not match"

        # Decode score rows to class indices.
        if np.ndim(predicted) != 1:
            assert (
                predicted.shape[1] == self.num_classes
            ), "number of predictions does not match size of confusion matrix"
            predicted = np.argmax(predicted, 1)

        # Drop ignored samples before range checks: the ignore label is
        # typically outside [0, K-1].
        keep = ~np.isin(target, self.ignore_label)
        predicted, target = predicted[keep], target[keep]

        if target.size == 0:
            # Nothing to count; max()/min() below would fail on empty input.
            return

        assert (predicted.max() < self.num_classes) and (
            predicted.min() >= 0
        ), "predicted values are not between 0 and k-1"
        assert (target.max() < self.num_classes) and (
            target.min() >= 0
        ), "target values are not between 0 and k-1"

        # hack for bincounting 2 arrays together: each (target, predicted)
        # pair maps to a unique flat index target * K + predicted
        x = predicted + self.num_classes * target
        bincount_2d = np.bincount(
            x.astype(np.int64), minlength=self.num_classes**2
        )
        assert bincount_2d.size == self.num_classes**2
        conf = bincount_2d.reshape((self.num_classes, self.num_classes))

        self.conf += conf

    def value(self, normalized=False):
        """
        Keyword arguments:
        - normalized (boolean, optional): if True, return a float32 copy in
          which every row is divided by its sum (rows that sum to zero
          stay zero). Default: False.

        Returns:
            Confusion matrix of K rows and K columns, where rows corresponds
            to ground-truth targets and columns corresponds to predicted
            targets.
        """
        if normalized:
            conf = self.conf.astype(np.float32)
            # The clip avoids a division by zero for classes never seen.
            return conf / conf.sum(1).clip(min=1e-12)[:, None]
        return self.conf
def value(self, conf_matrix):
    """Computes the per-class IoU from a confusion matrix.

    IoU = true_positive / (true_positive + false_positive + false_negative),
    where true positives are the diagonal, false positives the column sums
    minus the diagonal and false negatives the row sums minus the diagonal.

    Keyword arguments:
    - conf_matrix (array-like): K x K confusion matrix.

    Returns:
        numpy.ndarray with K elements holding the IoU of every class. A
        class with an empty union yields NaN. No mean is computed here;
        callers that need the mIoU can use ``numpy.nanmean`` on the result.
    """
    conf_matrix = np.asarray(conf_matrix)
    true_positive = np.diag(conf_matrix)
    false_positive = conf_matrix.sum(axis=0) - true_positive
    false_negative = conf_matrix.sum(axis=1) - true_positive

    # A class that never occurs gives 0 / 0; silence the warning and let
    # the entry become NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        iou = true_positive / (
            true_positive + false_positive + false_negative
        )

    return iou
+""" +from typing import List, Optional + +import torch +import torch.distributed as dist +import torchvision +from torch import Tensor + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an 
def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is available and initialized."""
    # Short-circuits: is_initialized() is consulted only when the
    # distributed package is available.
    return dist.is_available() and dist.is_initialized()
class BatchNormDim1Swap(nn.BatchNorm1d):
    """
    BatchNorm1d for sequence-first inputs (HW x N x C), as used by
    nn.Transformer.
    """

    def forward(self, x):
        """
        x: HW x N x C

        The input is permuted to N x C x HW, batch-normalised over C and
        permuted back to HW x N x C.
        """
        # The unpacking doubles as a check that x has exactly three axes.
        _hw, _n, _c = x.shape
        channels_second = x.permute(1, 2, 0)
        normalized = super().forward(channels_second)
        # n x c x hw -> hw x n x c
        return normalized.permute(2, 0, 1)
def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        # deepcopy gives every clone its own parameters.
        clones.append(copy.deepcopy(module))
    return clones
class ConvType(Enum):
    """
    Define the kernel region type.

    Every member is declared as ``(value, fullname)``: ``value`` becomes the
    enum value (also returned by ``int(member)``) and ``fullname`` is stored
    on the member as a plain string attribute.
    """

    HYPERCUBE = 0, "HYPERCUBE"
    SPATIAL_HYPERCUBE = 1, "SPATIAL_HYPERCUBE"
    SPATIO_TEMPORAL_HYPERCUBE = 2, "SPATIO_TEMPORAL_HYPERCUBE"
    HYPERCROSS = 3, "HYPERCROSS"
    SPATIAL_HYPERCROSS = 4, "SPATIAL_HYPERCROSS"
    SPATIO_TEMPORAL_HYPERCROSS = 5, "SPATIO_TEMPORAL_HYPERCROSS"
    # The fullname used to end with a stray space, which made it differ
    # from the member name.
    SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS = (
        6,
        "SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS",
    )

    def __new__(cls, value, name):
        member = object.__new__(cls)
        # Only the integer is the enum value, so ConvType(3) lookups work.
        member._value_ = value
        member.fullname = name
        return member

    def __int__(self):
        return self.value
def convert_conv_type(conv_type, kernel_size, D):
    """Translate a ``ConvType`` into kernel-generator arguments.

    Args:
        conv_type: a ``ConvType`` member (checked with an assert).
        kernel_size: a scalar or a sequence of per-axis sizes.
        D: number of dimensions; the SPATIO_TEMPORAL types assert ``D == 4``.

    Returns:
        ``(region_type, axis_types, kernel_size)``. ``region_type`` is looked
        up in ``conv_to_region_type``. ``axis_types`` is ``None`` except for
        ``SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS``. ``kernel_size`` is returned
        unchanged except for the two SPATIAL_* types, where it becomes a new
        list of three spatial sizes (plus a trailing ``1`` when ``D == 4``).
    """
    assert isinstance(conv_type, ConvType), "conv_type must be of ConvType"
    region_type = conv_to_region_type[conv_type]
    axis_types = None

    if conv_type in (ConvType.SPATIAL_HYPERCUBE, ConvType.SPATIAL_HYPERCROSS):
        # No temporal convolution: keep three spatial sizes only.
        if isinstance(kernel_size, Sequence):
            # list(...) so a tuple input can be appended to below and the
            # caller's own sequence is never mutated.
            kernel_size = list(kernel_size[:3])
        else:
            kernel_size = [kernel_size] * 3
        if D == 4:
            kernel_size.append(1)
    elif conv_type in (
        ConvType.SPATIO_TEMPORAL_HYPERCUBE,
        ConvType.SPATIO_TEMPORAL_HYPERCROSS,
    ):
        # conv_type conversion already handled
        assert D == 4
    elif conv_type == ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS:
        # Define the CUBIC conv kernel for spatial dims and CROSS conv for temp dim
        axis_types = [ME.RegionType.HYPER_CUBE] * 3
        if D == 4:
            axis_types.append(ME.RegionType.HYPER_CROSS)
    # HYPERCUBE / HYPERCROSS: conv_type conversion already handled above.

    return region_type, axis_types, kernel_size
def conv_tr(
    in_planes,
    out_planes,
    kernel_size,
    upsample_stride=1,
    dilation=1,
    bias=False,
    conv_type=ConvType.HYPERCUBE,
    D=-1,
):
    """Build an ``ME.MinkowskiConvolutionTranspose`` for ``conv_type``.

    ``kernel_size`` is first passed through ``convert_conv_type``; the
    resulting region type and axis types configure the ``ME.KernelGenerator``
    handed to the layer. ``D`` must be given explicitly: the default of -1
    fails the assert.
    """
    assert D > 0, "Dimension must be a positive integer"
    region, axes, resolved_kernel = convert_conv_type(conv_type, kernel_size, D)
    generator = ME.KernelGenerator(
        resolved_kernel,
        upsample_stride,
        dilation,
        region_type=region,
        axis_types=axes,
        dimension=D,
    )
    layer_kwargs = dict(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=resolved_kernel,
        stride=upsample_stride,
        dilation=dilation,
        bias=bias,
        kernel_generator=generator,
        dimension=D,
    )
    return ME.MinkowskiConvolutionTranspose(**layer_kwargs)
def sum_pool(
    kernel_size, stride=1, dilation=1, conv_type=ConvType.HYPERCUBE, D=-1
):
    """Build an ``ME.MinkowskiSumPooling`` layer for ``conv_type``.

    ``kernel_size`` is first passed through ``convert_conv_type``, and the
    resulting region type and axis types configure the kernel generator.
    ``D`` must be given explicitly: the default of -1 fails the assert.
    """
    assert D > 0, "Dimension must be a positive integer"
    region, axes, resolved_kernel = convert_conv_type(conv_type, kernel_size, D)
    generator = ME.KernelGenerator(
        resolved_kernel,
        stride,
        dilation,
        region_type=region,
        axis_types=axes,
        dimension=D,
    )
    pool_kwargs = dict(
        kernel_size=resolved_kernel,
        stride=stride,
        dilation=dilation,
        kernel_generator=generator,
        dimension=D,
    )
    return ME.MinkowskiSumPooling(**pool_kwargs)
class BatchNormDim1Swap(nn.BatchNorm1d):
    """
    Used for nn.Transformer that uses a HW x N x C rep
    """

    def forward(self, x):
        """
        x: HW x N x C
        permute to N x C x HW
        Apply BN on C
        permute back
        """
        # The unpack is kept on purpose: it rejects inputs that are not rank 3.
        hw, n, c = x.shape
        x = x.permute(1, 2, 0)
        x = super().forward(x)
        # x: n x c x hw -> hw x n x c
        x = x.permute(2, 0, 1)
        return x


# Name -> factory registries used by GenericMLP. Each norm factory is called
# with the channel count; each activation factory is called with no arguments.
NORM_DICT = {
    "bn": BatchNormDim1Swap,
    "bn1d": nn.BatchNorm1d,
    "id": nn.Identity,
    "ln": nn.LayerNorm,
}

ACTIVATION_DICT = {
    "relu": nn.ReLU,
    "gelu": nn.GELU,
    "leakyrelu": partial(nn.LeakyReLU, negative_slope=0.1),
}

WEIGHT_INIT_DICT = {
    "xavier_uniform": nn.init.xavier_uniform_,
}


class GenericMLP(nn.Module):
    """Configurable multi-layer perceptron built as one ``nn.Sequential``.

    Each entry of ``hidden_dims`` adds ``Linear`` (or kernel-size-1 ``Conv1d``
    when ``use_conv`` is set), then the optional norm, the activation and the
    optional dropout. A final projection to ``output_dim`` follows, optionally
    with its own norm and activation.

    Args:
        input_dim: number of input channels.
        hidden_dims: sizes of the hidden layers.
        output_dim: number of output channels.
        norm_fn_name: key of ``NORM_DICT`` or ``None`` for no normalisation.
            ``"ln"`` together with ``use_conv`` uses ``GroupNorm(1, C)``.
        activation: key of ``ACTIVATION_DICT``.
        use_conv: use ``nn.Conv1d(..., 1)`` layers instead of ``nn.Linear``.
        dropout: ``None``, a single probability used for every hidden layer,
            or a list/tuple with one probability per hidden layer.
        hidden_use_bias: bias flag for hidden layers.
        output_use_bias: bias flag for the output layer.
        output_use_activation: append the activation after the output layer.
        output_use_norm: append the norm after the output layer.
        weight_init_name: key of ``WEIGHT_INIT_DICT`` or ``None``.

    Raises:
        ValueError: if ``output_use_norm`` is set without ``norm_fn_name``,
            or a dropout sequence is shorter than ``hidden_dims``.
        KeyError: for unknown registry names.
    """

    def __init__(
        self,
        input_dim,
        hidden_dims,
        output_dim,
        norm_fn_name=None,
        activation="relu",
        use_conv=False,
        dropout=None,
        hidden_use_bias=False,
        output_use_bias=True,
        output_use_activation=False,
        output_use_norm=False,
        weight_init_name=None,
    ):
        super().__init__()
        activation = ACTIVATION_DICT[activation]
        norm = None
        if norm_fn_name is not None:
            norm = NORM_DICT[norm_fn_name]
        if norm_fn_name == "ln" and use_conv:
            norm = partial(nn.GroupNorm, 1)  # easier way to use LayerNorm

        if output_use_norm and norm is None:
            # Previously this surfaced later as "'NoneType' is not callable".
            raise ValueError(
                "output_use_norm=True requires norm_fn_name to be set"
            )

        if dropout is not None:
            if isinstance(dropout, (list, tuple)):
                dropout = list(dropout)
                if len(dropout) < len(hidden_dims):
                    raise ValueError(
                        f"dropout has {len(dropout)} entries but there are "
                        f"{len(hidden_dims)} hidden layers"
                    )
            else:
                dropout = [dropout] * len(hidden_dims)

        def make_layer(in_dim, out_dim, bias):
            # Conv1d with kernel size 1 or a plain Linear, depending on use_conv.
            if use_conv:
                return nn.Conv1d(in_dim, out_dim, 1, bias=bias)
            return nn.Linear(in_dim, out_dim, bias=bias)

        layers = []
        prev_dim = input_dim
        for idx, hidden_dim in enumerate(hidden_dims):
            layers.append(make_layer(prev_dim, hidden_dim, hidden_use_bias))
            if norm is not None:
                layers.append(norm(hidden_dim))
            layers.append(activation())
            if dropout is not None:
                layers.append(nn.Dropout(p=dropout[idx]))
            prev_dim = hidden_dim
        layers.append(make_layer(prev_dim, output_dim, output_use_bias))

        if output_use_norm:
            layers.append(norm(output_dim))

        if output_use_activation:
            layers.append(activation())

        self.layers = nn.Sequential(*layers)

        if weight_init_name is not None:
            self.do_weight_init(weight_init_name)

    def do_weight_init(self, weight_init_name):
        """Apply the named initialiser to every parameter with more than one dim."""
        func = WEIGHT_INIT_DICT[weight_init_name]
        for _, param in self.named_parameters():
            if param.dim() > 1:  # skips batchnorm/layernorm
                func(param)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.layers(x)
class BottleneckBase(nn.Module):
    """Residual bottleneck block: three conv/norm stages plus a shortcut.

    The stages use kernel sizes 1, 3 and 1; the last one widens the features
    to ``planes * expansion`` channels. Subclasses choose the normalisation by
    overriding ``NORM_TYPE``.
    """

    expansion = 4
    NORM_TYPE = NormType.BATCH_NORM

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        dilation=1,
        downsample=None,
        conv_type=ConvType.HYPERCUBE,
        bn_momentum=0.1,
        D=3,
    ):
        super().__init__()
        widened = planes * self.expansion

        def make_norm(channels):
            # Every norm in the block shares NORM_TYPE and bn_momentum.
            return get_norm(
                self.NORM_TYPE, channels, D, bn_momentum=bn_momentum
            )

        self.conv1 = conv(inplanes, planes, kernel_size=1, D=D)
        self.norm1 = make_norm(planes)

        # Only the middle convolution receives stride/dilation/conv_type.
        self.conv2 = conv(
            planes,
            planes,
            kernel_size=3,
            stride=stride,
            dilation=dilation,
            conv_type=conv_type,
            D=D,
        )
        self.norm2 = make_norm(planes)

        self.conv3 = conv(planes, widened, kernel_size=1, D=D)
        self.norm3 = make_norm(widened)

        self.relu = MinkowskiReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Apply the three stages, add the shortcut, then the final ReLU."""
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.relu(self.norm2(self.conv2(out)))
        out = self.norm3(self.conv3(out))

        # The shortcut is the raw input unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
class BasicBlockBase(nn.Module):
    """Residual basic block: two kernel-size-3 conv/norm stages plus a shortcut.

    Subclasses choose the normalisation by overriding ``NORM_TYPE``.
    """

    expansion = 1
    NORM_TYPE = NormType.BATCH_NORM

    def __init__(
        self,
        inplanes,
        planes,
        stride=1,
        dilation=1,
        downsample=None,
        conv_type=ConvType.HYPERCUBE,
        bn_momentum=0.1,
        D=3,
    ):
        super().__init__()

        def make_norm():
            # Both norms act on ``planes`` channels with the same settings.
            return get_norm(self.NORM_TYPE, planes, D, bn_momentum=bn_momentum)

        # Only the first convolution applies the stride.
        self.conv1 = conv(
            inplanes,
            planes,
            kernel_size=3,
            stride=stride,
            dilation=dilation,
            conv_type=conv_type,
            D=D,
        )
        self.norm1 = make_norm()
        self.conv2 = conv(
            planes,
            planes,
            kernel_size=3,
            stride=1,
            dilation=dilation,
            bias=False,
            conv_type=conv_type,
            D=D,
        )
        self.norm2 = make_norm()
        self.relu = MinkowskiReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        """Apply both stages, add the shortcut, then the final ReLU."""
        out = self.relu(self.norm1(self.conv1(x)))
        out = self.norm2(self.conv2(out))

        # The shortcut is the raw input unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
class SELayer(nn.Module):
    """Channel gating layer built from MinkowskiEngine modules.

    The input is globally pooled, passed through a two-layer bottleneck
    (``channel -> channel // reduction -> channel``) ending in a sigmoid, and
    the result is broadcast-multiplied back onto the input.
    """

    def __init__(self, channel, reduction=16, D=-1):
        # Global coords does not require coords_key
        super().__init__()
        bottleneck = channel // reduction
        gate_layers = [
            ME.MinkowskiLinear(channel, bottleneck),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiLinear(bottleneck, channel),
            ME.MinkowskiSigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)
        self.pooling = ME.MinkowskiGlobalPooling(dimension=D)
        self.broadcast_mul = ME.MinkowskiBroadcastMultiplication(dimension=D)

    def forward(self, x):
        """Return ``x`` scaled by the gate computed from its pooled features."""
        gate = self.fc(self.pooling(x))
        return self.broadcast_mul(x, gate)
class PositionEmbeddingCoordsSine(nn.Module):
    """Positional encoding of point coordinates ("sine" or "fourier").

    ``forward`` takes ``xyz`` of shape batch x npoints x d_in and returns a
    batch x channels x npoints tensor, computed under ``torch.no_grad()``.
    The input tensor is never modified.

    Args:
        temperature: base of the frequency schedule used by the sine variant.
        normalize: if True, coordinates are first rescaled with
            ``shift_scale_points`` using the ``input_range`` given to
            ``forward``.
        scale: multiplier applied to the coordinates in the sine variant;
            defaults to ``2 * pi``. Only accepted when ``normalize`` is True.
        pos_type: ``"sine"`` or ``"fourier"``.
        d_pos: number of output channels. Required and even for "fourier".
        d_in: coordinate dimensionality of the random fourier projection.
        gauss_scale: standard deviation of the random fourier projection.
    """

    def __init__(
        self,
        temperature=10000,
        normalize=False,
        scale=None,
        pos_type="fourier",
        d_pos=None,
        d_in=3,
        gauss_scale=1.0,
    ):
        super().__init__()
        self.d_pos = d_pos
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        assert pos_type in ["sine", "fourier"]
        self.pos_type = pos_type
        self.scale = scale
        if pos_type == "fourier":
            assert d_pos is not None
            assert d_pos % 2 == 0
            # define a gaussian matrix input_ch -> output_ch
            B = torch.empty((d_in, d_pos // 2)).normal_()
            B *= gauss_scale
            self.register_buffer("gauss_B", B)

    def _prepare_coords(self, xyz, input_range):
        """Return a private copy of ``xyz``, rescaled when ``normalize`` is set."""
        # Clone so later in-place scaling can never touch the caller's tensor.
        xyz = xyz.clone()
        if not self.normalize:
            return xyz
        if input_range is None:
            raise ValueError("input_range is required when normalize=True")
        return shift_scale_points(xyz, src_range=input_range)

    def get_sine_embeddings(self, xyz, num_channels, input_range):
        """Sinusoidal embedding; returns batch x channels x npoints.

        ``self.d_pos`` takes precedence over ``num_channels`` when it is set
        (this keeps the existing behaviour); ``num_channels`` is used only as
        a fallback when ``d_pos`` is None, a case that used to crash.

        Channels are split evenly over the coordinate axes in even-sized
        groups; any remainder is handed to the leading axes two at a time.
        """
        if self.d_pos is not None:
            num_channels = self.d_pos
        if num_channels is None:
            raise ValueError(
                "num_channels must be given when d_pos was not set"
            )

        xyz = self._prepare_coords(xyz, input_range)

        n_axes = xyz.shape[2]
        ndim = num_channels // n_axes
        if ndim % 2 != 0:
            ndim -= 1
        # automatically handle remainder by assigning it to the first dim
        rems = num_channels - (ndim * n_axes)

        final_embeds = []
        prev_dim = None
        dim_t = None

        for d in range(n_axes):
            cdim = ndim
            if rems > 0:
                # add remainder in increments of two to maintain even size
                cdim += 2
                rems -= 2

            # Recompute the frequency schedule only when the width changes.
            if cdim != prev_dim:
                dim_t = torch.arange(
                    cdim, dtype=torch.float32, device=xyz.device
                )
                dim_t = self.temperature ** (2 * (dim_t // 2) / cdim)

            # create batch x npoints x cdim embedding
            raw_pos = xyz[:, :, d]
            if self.scale:
                raw_pos = raw_pos * self.scale
            pos = raw_pos[:, :, None] / dim_t
            pos = torch.stack(
                (pos[:, :, 0::2].sin(), pos[:, :, 1::2].cos()), dim=3
            ).flatten(2)
            final_embeds.append(pos)
            prev_dim = cdim

        final_embeds = torch.cat(final_embeds, dim=2).permute(0, 2, 1)
        return final_embeds

    def get_fourier_embeddings(self, xyz, num_channels=None, input_range=None):
        """Random fourier-feature embedding; returns batch x channels x npoints.

        ``num_channels`` defaults to twice the width of ``gauss_B`` and must
        be even and no larger than that.
        """
        # Follows - https://people.eecs.berkeley.edu/~bmild/fourfeat/index.html

        if num_channels is None:
            num_channels = self.gauss_B.shape[1] * 2

        bsize, npoints = xyz.shape[0], xyz.shape[1]
        assert num_channels > 0 and num_channels % 2 == 0
        d_in, max_d_out = self.gauss_B.shape[0], self.gauss_B.shape[1]
        d_out = num_channels // 2
        assert d_out <= max_d_out
        assert d_in == xyz.shape[-1]

        xyz = self._prepare_coords(xyz, input_range)

        xyz *= 2 * np.pi
        # reshape (not view): clone() keeps the strides of a non-contiguous
        # input, and view() would raise on such a tensor.
        xyz_proj = torch.mm(
            xyz.reshape(-1, d_in), self.gauss_B[:, :d_out]
        ).view(bsize, npoints, d_out)
        final_embeds = [xyz_proj.sin(), xyz_proj.cos()]

        # return batch x d_pos x npoints embedding
        final_embeds = torch.cat(final_embeds, dim=2).permute(0, 2, 1)
        return final_embeds

    def forward(self, xyz, num_channels=None, input_range=None):
        """Dispatch to the configured embedding; gradients are not tracked."""
        assert isinstance(xyz, torch.Tensor)
        assert xyz.ndim == 3
        # xyz is batch x npoints x 3
        if self.pos_type == "sine":
            with torch.no_grad():
                out = self.get_sine_embeddings(xyz, num_channels, input_range)
        elif self.pos_type == "fourier":
            with torch.no_grad():
                out = self.get_fourier_embeddings(
                    xyz, num_channels, input_range
                )
        else:
            raise ValueError(f"Unknown {self.pos_type}")

        return out

    def extra_repr(self):
        st = f"type={self.pos_type}, scale={self.scale}, normalize={self.normalize}"
        if hasattr(self, "gauss_B"):
            st += f", gaussB={self.gauss_B.shape}, gaussBsum={self.gauss_B.sum().item()}"
        return st
class Res16UNetBase(ResNetBase):
    """Sparse U-Net backbone: four strided encoder stages, four transposed-conv
    decoder stages with skip concatenations.

    Subclasses select the residual block (``BLOCK``), the number of blocks per
    stage (``LAYERS``) and the stage widths (``PLANES``). Entries 0-3 of
    ``PLANES``/``LAYERS``/``DILATIONS`` configure the encoder stages and
    entries 4-7 the decoder stages.

    ``forward`` returns the output of the last decoder stage, or that tensor
    plus a list of five intermediate feature maps when ``out_fpn`` is True.
    """

    BLOCK = None
    PLANES = (32, 64, 128, 256, 256, 256, 256, 256)
    DILATIONS = (1, 1, 1, 1, 1, 1, 1, 1)
    LAYERS = (2, 2, 2, 2, 2, 2, 2, 2)
    INIT_DIM = 32
    OUT_PIXEL_DIST = 1
    NORM_TYPE = NormType.BATCH_NORM
    # Conv type of the stem, down-sampling and up-sampling convolutions.
    NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE
    # Conv type handed to the residual blocks by ``_make_layer``.
    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    # To use the model, must call initialize_coords before forward pass.
    # Once data is processed, call clear to reset the model before calling initialize_coords
    def __init__(
        self, in_channels, out_channels, config, D=3, out_fpn=False, **kwargs
    ):
        # NOTE(review): **kwargs is accepted but not forwarded to the parent.
        super().__init__(in_channels, out_channels, config, D)
        # Read only in forward(); selects the return shape.
        self.out_fpn = out_fpn

    def network_initialization(self, in_channels, out_channels, config, D):
        """Create all layers.

        Reads ``config.bn_momentum`` and ``config.conv1_kernel_size``.
        Dilations come from the class attribute ``DILATIONS``, not from
        ``config``. Statement order matters: ``self.inplanes`` is updated by
        every ``_make_layer`` call and reassigned before each decoder stage
        to account for the concatenated skip features.
        """
        # Setup net_metadata
        dilations = self.DILATIONS
        bn_momentum = config.bn_momentum

        def space_n_time_m(n, m):
            # Scalar for D == 3; otherwise a 4-entry list whose last
            # entry is m.
            return n if D == 3 else [n, n, n, m]

        if D == 4:
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        # The first conv's output (out_p1 in forward) is concatenated back in
        # just before block8.
        self.inplanes = self.INIT_DIM
        self.conv0p1s1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )

        self.bn0 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )

        # Encoder stage 1: stride-2 conv, then residual blocks.
        self.conv1p1s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn1 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block1 = self._make_layer(
            self.BLOCK,
            self.PLANES[0],
            self.LAYERS[0],
            dilation=dilations[0],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        # Encoder stage 2.
        self.conv2p2s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn2 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block2 = self._make_layer(
            self.BLOCK,
            self.PLANES[1],
            self.LAYERS[1],
            dilation=dilations[1],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        # Encoder stage 3.
        self.conv3p4s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn3 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block3 = self._make_layer(
            self.BLOCK,
            self.PLANES[2],
            self.LAYERS[2],
            dilation=dilations[2],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        # Encoder stage 4 (coarsest resolution).
        self.conv4p8s2 = conv(
            self.inplanes,
            self.inplanes,
            kernel_size=space_n_time_m(2, 1),
            stride=space_n_time_m(2, 1),
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn4 = get_norm(
            self.NORM_TYPE, self.inplanes, D, bn_momentum=bn_momentum
        )
        self.block4 = self._make_layer(
            self.BLOCK,
            self.PLANES[3],
            self.LAYERS[3],
            dilation=dilations[3],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        # Decoder stage 1: transposed conv, then blocks on [upsampled, skip].
        self.convtr4p16s2 = conv_tr(
            self.inplanes,
            self.PLANES[4],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr4 = get_norm(
            self.NORM_TYPE, self.PLANES[4], D, bn_momentum=bn_momentum
        )

        # Input width of block5 = upsampled channels + block3 skip channels.
        self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion
        self.block5 = self._make_layer(
            self.BLOCK,
            self.PLANES[4],
            self.LAYERS[4],
            dilation=dilations[4],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        # Decoder stage 2.
        self.convtr5p8s2 = conv_tr(
            self.inplanes,
            self.PLANES[5],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr5 = get_norm(
            self.NORM_TYPE, self.PLANES[5], D, bn_momentum=bn_momentum
        )

        # Input width of block6 = upsampled channels + block2 skip channels.
        self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion
        self.block6 = self._make_layer(
            self.BLOCK,
            self.PLANES[5],
            self.LAYERS[5],
            dilation=dilations[5],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        # Decoder stage 3.
        self.convtr6p4s2 = conv_tr(
            self.inplanes,
            self.PLANES[6],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr6 = get_norm(
            self.NORM_TYPE, self.PLANES[6], D, bn_momentum=bn_momentum
        )

        # Input width of block7 = upsampled channels + block1 skip channels.
        self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion
        self.block7 = self._make_layer(
            self.BLOCK,
            self.PLANES[6],
            self.LAYERS[6],
            dilation=dilations[6],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )
        # Decoder stage 4 (back to the input resolution).
        self.convtr7p2s2 = conv_tr(
            self.inplanes,
            self.PLANES[7],
            kernel_size=space_n_time_m(2, 1),
            upsample_stride=space_n_time_m(2, 1),
            dilation=1,
            bias=False,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bntr7 = get_norm(
            self.NORM_TYPE, self.PLANES[7], D, bn_momentum=bn_momentum
        )

        # Input width of block8 = upsampled channels + stem (conv0) channels.
        self.inplanes = self.PLANES[7] + self.INIT_DIM
        self.block8 = self._make_layer(
            self.BLOCK,
            self.PLANES[7],
            self.LAYERS[7],
            dilation=dilations[7],
            norm_type=self.NORM_TYPE,
            bn_momentum=bn_momentum,
        )

        # NOTE(review): ``final`` is created but never called in forward().
        # Its input width is PLANES[7], whereas block8 produces
        # PLANES[7] * BLOCK.expansion channels; these differ for blocks with
        # expansion != 1 -- confirm before wiring it in.
        self.final = conv(
            self.PLANES[7],
            out_channels,
            kernel_size=1,
            stride=1,
            bias=True,
            D=D,
        )
        self.relu = MinkowskiReLU(inplace=True)

    def forward(self, x):
        """Run the encoder/decoder.

        Returns ``out`` (the block8 output) when ``out_fpn`` is False,
        otherwise ``(out, feature_maps)`` where ``feature_maps`` holds the
        outputs of block4, block5, block6, block7 and block8 in that order.
        """
        feature_maps = []

        # Stem; kept for the last skip connection.
        out = self.conv0p1s1(x)
        out = self.bn0(out)
        out_p1 = self.relu(out)

        out = self.conv1p1s2(out_p1)
        out = self.bn1(out)
        out = self.relu(out)
        out_b1p2 = self.block1(out)

        out = self.conv2p2s2(out_b1p2)
        out = self.bn2(out)
        out = self.relu(out)
        out_b2p4 = self.block2(out)

        out = self.conv3p4s2(out_b2p4)
        out = self.bn3(out)
        out = self.relu(out)
        out_b3p8 = self.block3(out)

        # pixel_dist=16
        out = self.conv4p8s2(out_b3p8)
        out = self.bn4(out)
        out = self.relu(out)
        out = self.block4(out)

        feature_maps.append(out)

        # pixel_dist=8
        out = self.convtr4p16s2(out)
        out = self.bntr4(out)
        out = self.relu(out)

        # Skip connection from encoder stage 3.
        out = me.cat(out, out_b3p8)
        out = self.block5(out)

        feature_maps.append(out)

        # pixel_dist=4
        out = self.convtr5p8s2(out)
        out = self.bntr5(out)
        out = self.relu(out)

        # Skip connection from encoder stage 2.
        out = me.cat(out, out_b2p4)
        out = self.block6(out)

        feature_maps.append(out)

        # pixel_dist=2
        out = self.convtr6p4s2(out)
        out = self.bntr6(out)
        out = self.relu(out)

        # Skip connection from encoder stage 1.
        out = me.cat(out, out_b1p2)
        out = self.block7(out)

        feature_maps.append(out)

        # pixel_dist=1
        out = self.convtr7p2s2(out)
        out = self.bntr7(out)
        out = self.relu(out)

        # Skip connection from the stem.
        out = me.cat(out, out_p1)
        out = self.block8(out)

        feature_maps.append(out)

        if not self.out_fpn:
            return out
        else:
            return out, feature_maps
class STRes16UNetBase(Res16UNetBase):
    """``Res16UNetBase`` variant whose constructor defaults to ``D=4``."""

    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=4, **kwargs):
        # Identical to the parent constructor except for the default of D.
        super().__init__(in_channels, out_channels, config, D=D, **kwargs)
class ResNetBase(Model):
    """Sparse-convolution ResNet: stem, sum-pooling, four residual stages, 1x1 head.

    Concrete networks set ``BLOCK`` (the residual block class) and ``LAYERS``
    (how many blocks each of the four stages stacks). Construction is split
    into ``network_initialization`` and ``weight_initialization`` so that
    subclasses can replace either step.
    """

    BLOCK = None
    LAYERS = ()
    INIT_DIM = 64
    PLANES = (64, 128, 256, 512)
    OUT_PIXEL_DIST = 32
    HAS_LAST_BLOCK = False
    CONV_TYPE = ConvType.HYPERCUBE

    def __init__(self, in_channels, out_channels, config, D=3, **kwargs):
        # Guard against instantiating the abstract base directly.
        assert self.BLOCK is not None
        assert self.OUT_PIXEL_DIST > 0

        super().__init__(in_channels, out_channels, config, D, **kwargs)

        self.network_initialization(in_channels, out_channels, config, D)
        self.weight_initialization()

    def network_initialization(self, in_channels, out_channels, config, D):
        """Create the stem, the pooling layer, ``layer1``..``layer4`` and the head.

        Reads ``config.dilations``, ``config.bn_momentum`` and
        ``config.conv1_kernel_size``.
        """

        def space_n_time_m(n, m):
            # D == 3 keeps the scalar; any other D expands to [n, n, n, m].
            if D == 3:
                return n
            return [n, n, n, m]

        if D == 4:
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        stage_dilations = config.dilations
        momentum = config.bn_momentum

        # Stem: convolution, batch norm, ReLU and a stride-2 sum pooling.
        self.inplanes = self.INIT_DIM
        self.conv1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            D=D,
        )
        self.bn1 = get_norm(
            NormType.BATCH_NORM, self.inplanes, D=self.D, bn_momentum=momentum
        )
        self.relu = ME.MinkowskiReLU(inplace=True)
        self.pool = sum_pool(
            kernel_size=space_n_time_m(2, 1), stride=space_n_time_m(2, 1), D=D
        )

        # Residual stages are registered in order as layer1 .. layer4.
        # NOTE: `momentum` is not passed on, so the stages use the
        # `_make_layer` default for bn_momentum.
        for stage_idx in range(4):
            stage = self._make_layer(
                self.BLOCK,
                self.PLANES[stage_idx],
                self.LAYERS[stage_idx],
                stride=space_n_time_m(2, 1),
                dilation=space_n_time_m(stage_dilations[stage_idx], 1),
            )
            setattr(self, f"layer{stage_idx + 1}", stage)

        # 1x1 head mapping the last stage's channels to `out_channels`.
        self.final = conv(
            self.PLANES[3] * self.BLOCK.expansion,
            out_channels,
            kernel_size=1,
            bias=True,
            D=D,
        )

    def weight_initialization(self):
        """Reset every MinkowskiBatchNorm to weight 1 and bias 0."""
        batch_norms = (
            module
            for module in self.modules()
            if isinstance(module, ME.MinkowskiBatchNorm)
        )
        for batch_norm in batch_norms:
            nn.init.constant_(batch_norm.bn.weight, 1)
            nn.init.constant_(batch_norm.bn.bias, 0)

    def _make_layer(
        self,
        block,
        planes,
        blocks,
        stride=1,
        dilation=1,
        norm_type=NormType.BATCH_NORM,
        bn_momentum=0.1,
    ):
        """Stack ``blocks`` residual blocks and update ``self.inplanes``.

        Only the first block receives ``stride`` and, when the stride is not
        1 or the channel count changes, a 1x1 conv + norm shortcut. The
        remaining blocks use stride 1. Returns an ``nn.Sequential``.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if stride != 1 or self.inplanes != out_planes:
            shortcut = nn.Sequential(
                conv(
                    self.inplanes,
                    out_planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                    D=self.D,
                ),
                get_norm(
                    norm_type, out_planes, D=self.D, bn_momentum=bn_momentum
                ),
            )

        stage = [
            block(
                self.inplanes,
                planes,
                stride=stride,
                dilation=dilation,
                downsample=shortcut,
                conv_type=self.CONV_TYPE,
                D=self.D,
            )
        ]
        # Every block after the first consumes the expanded channel count.
        self.inplanes = out_planes
        stage.extend(
            block(
                self.inplanes,
                planes,
                stride=1,
                dilation=dilation,
                conv_type=self.CONV_TYPE,
                D=self.D,
            )
            for _ in range(1, blocks)
        )

        return nn.Sequential(*stage)

    def forward(self, x):
        """Apply stem, pooling, the four stages and the head, in that order."""
        pipeline = (
            self.conv1,
            self.bn1,
            self.relu,
            self.pool,
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
            self.final,
        )
        for step in pipeline:
            x = step(x)
        return x


class ResNet14(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1)


class ResNet18(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2)


class ResNet34(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (3, 4, 6, 3)


class ResNet50(ResNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 6, 3)


class ResNet101(ResNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 23, 3)


class STResNetBase(ResNetBase):
    """ResNetBase with a different CONV_TYPE and a default of D=4."""

    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=4, **kwargs):
        super().__init__(in_channels, out_channels, config, D, **kwargs)


class STResNet14(STResNetBase, ResNet14):
    pass


class STResNet18(STResNetBase, ResNet18):
    pass


class STResNet34(STResNetBase, ResNet34):
    pass


class STResNet50(STResNetBase, ResNet50):
    pass


class STResNet101(STResNetBase, ResNet101):
    pass


class STResTesseractNetBase(STResNetBase):
    """STResNetBase with CONV_TYPE switched back to HYPERCUBE."""

    CONV_TYPE = ConvType.HYPERCUBE


class STResTesseractNet14(STResTesseractNetBase, STResNet14):
    pass


class STResTesseractNet18(STResTesseractNetBase, STResNet18):
    pass


class STResTesseractNet34(STResTesseractNetBase, STResNet34):
    pass


class STResTesseractNet50(STResTesseractNetBase, STResNet50):
    pass


class STResTesseractNet101(STResTesseractNetBase, STResNet101):
    pass
# NOTE(review): this is the `resnet.py.tmp` copy of ResNetBase. Its code is the
# same as in `resnet.py` apart from line wrapping; only the import paths of the
# file differ (`models.*` here, `mask3d.models.*` there). It looks like a
# leftover temporary file - confirm whether it should be tracked at all.
class ResNetBase(Model):
    # Residual block class; concrete subclasses must set it (see the assert).
    BLOCK = None
    # Number of residual blocks in layer1..layer4.
    LAYERS = ()
    # Output channels of the stem convolution.
    INIT_DIM = 64
    # Base channel count handed to _make_layer for layer1..layer4.
    PLANES = (64, 128, 256, 512)
    OUT_PIXEL_DIST = 32
    HAS_LAST_BLOCK = False
    CONV_TYPE = ConvType.HYPERCUBE

    def __init__(self, in_channels, out_channels, config, D=3, **kwargs):
        # Refuse to build the abstract base (BLOCK unset).
        assert self.BLOCK is not None
        assert self.OUT_PIXEL_DIST > 0

        super().__init__(in_channels, out_channels, config, D, **kwargs)

        # Build the modules first, then reset the batch-norm parameters.
        self.network_initialization(in_channels, out_channels, config, D)
        self.weight_initialization()

    def network_initialization(self, in_channels, out_channels, config, D):
        # Creates conv1/bn1/relu/pool, layer1..layer4 and the 1x1 `final` head.
        # Reads config.dilations, config.bn_momentum, config.conv1_kernel_size.
        def space_n_time_m(n, m):
            # D == 3 keeps the scalar; any other D expands to [n, n, n, m].
            return n if D == 3 else [n, n, n, m]

        if D == 4:
            # Stored on the instance, shadowing the class-level constant.
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        dilations = config.dilations
        bn_momentum = config.bn_momentum
        # Running channel count; _make_layer updates it after every stage.
        self.inplanes = self.INIT_DIM
        self.conv1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            D=D,
        )

        self.bn1 = get_norm(
            NormType.BATCH_NORM, self.inplanes, D=self.D, bn_momentum=bn_momentum
        )
        self.relu = ME.MinkowskiReLU(inplace=True)
        self.pool = sum_pool(
            kernel_size=space_n_time_m(2, 1), stride=space_n_time_m(2, 1), D=D
        )

        # NOTE: bn_momentum is not forwarded to _make_layer, so the stages use
        # that method's default value.
        self.layer1 = self._make_layer(
            self.BLOCK,
            self.PLANES[0],
            self.LAYERS[0],
            stride=space_n_time_m(2, 1),
            dilation=space_n_time_m(dilations[0], 1),
        )
        self.layer2 = self._make_layer(
            self.BLOCK,
            self.PLANES[1],
            self.LAYERS[1],
            stride=space_n_time_m(2, 1),
            dilation=space_n_time_m(dilations[1], 1),
        )
        self.layer3 = self._make_layer(
            self.BLOCK,
            self.PLANES[2],
            self.LAYERS[2],
            stride=space_n_time_m(2, 1),
            dilation=space_n_time_m(dilations[2], 1),
        )
        self.layer4 = self._make_layer(
            self.BLOCK,
            self.PLANES[3],
            self.LAYERS[3],
            stride=space_n_time_m(2, 1),
            dilation=space_n_time_m(dilations[3], 1),
        )

        # 1x1 head from the last stage's channels to out_channels.
        self.final = conv(
            self.PLANES[3] * self.BLOCK.expansion,
            out_channels,
            kernel_size=1,
            bias=True,
            D=D,
        )

    def weight_initialization(self):
        # Reset every MinkowskiBatchNorm to weight 1 / bias 0.
        for m in self.modules():
            if isinstance(m, ME.MinkowskiBatchNorm):
                nn.init.constant_(m.bn.weight, 1)
                nn.init.constant_(m.bn.bias, 0)

    def _make_layer(
        self,
        block,
        planes,
        blocks,
        stride=1,
        dilation=1,
        norm_type=NormType.BATCH_NORM,
        bn_momentum=0.1,
    ):
        # Builds one stage of `blocks` residual blocks as an nn.Sequential and
        # advances self.inplanes to planes * block.expansion.
        downsample = None
        # A 1x1 conv + norm shortcut is needed whenever the first block changes
        # the stride or the channel count.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                    D=self.D,
                ),
                get_norm(
                    norm_type,
                    planes * block.expansion,
                    D=self.D,
                    bn_momentum=bn_momentum,
                ),
            )
        layers = []
        # Only the first block receives the stride and the shortcut.
        layers.append(
            block(
                self.inplanes,
                planes,
                stride=stride,
                dilation=dilation,
                downsample=downsample,
                conv_type=self.CONV_TYPE,
                D=self.D,
            )
        )
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    stride=1,
                    dilation=dilation,
                    conv_type=self.CONV_TYPE,
                    D=self.D,
                )
            )

        return nn.Sequential(*layers)

    def forward(self, x):
        # Stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool(x)

        # Residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # 1x1 head
        x = self.final(x)
        return x


# Concrete depths: BLOCK selects the residual block class, LAYERS the number of
# blocks per stage.
class ResNet14(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1)


class ResNet18(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2)


class ResNet34(ResNetBase):
    BLOCK = BasicBlock
    LAYERS = (3, 4, 6, 3)


class ResNet50(ResNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 6, 3)


class ResNet101(ResNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 23, 3)


class STResNetBase(ResNetBase):
    # Same network with a different CONV_TYPE and a default of D=4.

    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=4, **kwargs):
        super().__init__(in_channels, out_channels, config, D, **kwargs)


# STResNetBase is listed first so its CONV_TYPE/__init__ win in the MRO;
# BLOCK and LAYERS come from the second base.
class STResNet14(STResNetBase, ResNet14):
    pass


class STResNet18(STResNetBase, ResNet18):
    pass


class STResNet34(STResNetBase, ResNet34):
    pass


class STResNet50(STResNetBase, ResNet50):
    pass


class STResNet101(STResNetBase, ResNet101):
    pass


class STResTesseractNetBase(STResNetBase):
    # Switches CONV_TYPE back to HYPERCUBE while keeping the D=4 default.
    CONV_TYPE = ConvType.HYPERCUBE


class STResTesseractNet14(STResTesseractNetBase, STResNet14):
    pass


class STResTesseractNet18(STResTesseractNetBase, STResNet18):
    pass
class MinkUNetBase(ResNetBase):
    """Sparse U-Net: four encoder stages, three decoder stages with skips, 1x1 head.

    The constructor inherited from ``ResNetBase`` calls
    ``network_initialization`` and ``weight_initialization``; this class
    replaces the former and ``forward``.

    Legacy note carried over from the original source: callers are told to
    call ``initialize_coords`` before the forward pass and ``clear`` once the
    data is processed. Neither method is defined in this class - TODO confirm
    whether the note still applies.
    """

    BLOCK = None
    PLANES = (64, 128, 256, 512, 256, 128, 128)
    DILATIONS = (1, 1, 1, 1, 1, 1)
    LAYERS = (2, 2, 2, 2, 2, 2)
    INIT_DIM = 64
    OUT_PIXEL_DIST = 1
    NORM_TYPE = NormType.BATCH_NORM
    NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE
    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=3, **kwargs):
        # **kwargs is accepted for signature compatibility but not forwarded.
        super().__init__(in_channels, out_channels, config, D)

    def network_initialization(self, in_channels, out_channels, config, D):
        """Create encoder, decoder and head modules.

        Reads ``config.bn_momentum`` and ``config.conv1_kernel_size``.
        Dilations come from the class constant ``DILATIONS``, not from
        ``config``. Modules are created in the order ``forward`` applies them.
        """
        rates = self.DILATIONS
        momentum = config.bn_momentum

        def space_n_time_m(n, m):
            # D == 3 keeps the scalar; any other D expands to [n, n, n, m].
            if D == 3:
                return n
            return [n, n, n, m]

        def norm(channels):
            return get_norm(self.NORM_TYPE, channels, D, bn_momentum=momentum)

        def residual_stage(idx):
            # _make_layer also advances self.inplanes to the stage's width.
            return self._make_layer(
                self.BLOCK,
                self.PLANES[idx],
                self.LAYERS[idx],
                dilation=rates[idx],
                norm_type=self.NORM_TYPE,
                bn_momentum=momentum,
            )

        def strided_conv():
            # Kernel-2 / stride-2 convolution that keeps the channel count.
            return conv(
                self.inplanes,
                self.inplanes,
                kernel_size=space_n_time_m(2, 1),
                stride=space_n_time_m(2, 1),
                dilation=1,
                conv_type=self.NON_BLOCK_CONV_TYPE,
                D=D,
            )

        def transposed_conv(out_planes):
            # Kernel-2 / stride-2 transposed convolution, no bias.
            return conv_tr(
                self.inplanes,
                out_planes,
                kernel_size=space_n_time_m(2, 1),
                upsample_stride=space_n_time_m(2, 1),
                dilation=1,
                bias=False,
                conv_type=self.NON_BLOCK_CONV_TYPE,
                D=D,
            )

        if D == 4:
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        # ---- encoder -------------------------------------------------------
        self.inplanes = self.INIT_DIM
        self.conv1p1s1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        # bn1 is sized by PLANES[0] while the conv outputs INIT_DIM channels;
        # the two agree for every configuration defined in this file.
        self.bn1 = norm(self.PLANES[0])
        self.block1 = residual_stage(0)

        self.conv2p1s2 = strided_conv()
        self.bn2 = norm(self.inplanes)
        self.block2 = residual_stage(1)

        self.conv3p2s2 = strided_conv()
        self.bn3 = norm(self.inplanes)
        self.block3 = residual_stage(2)

        self.conv4p4s2 = strided_conv()
        self.bn4 = norm(self.inplanes)
        self.block4 = residual_stage(3)

        # ---- decoder -------------------------------------------------------
        self.convtr4p8s2 = transposed_conv(self.PLANES[4])
        self.bntr4 = norm(self.PLANES[4])

        # block5 consumes the upsampled features concatenated with block3's.
        self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion
        self.block5 = residual_stage(4)
        self.convtr5p4s2 = transposed_conv(self.PLANES[5])
        self.bntr5 = norm(self.PLANES[5])

        # block6 consumes the upsampled features concatenated with block2's.
        self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion
        self.block6 = residual_stage(5)
        self.convtr6p2s2 = transposed_conv(self.PLANES[6])
        self.bntr6 = norm(self.PLANES[6])
        self.relu = MinkowskiReLU(inplace=True)

        # ---- head: decoder output concatenated with block1's output ---------
        self.final = nn.Sequential(
            conv(
                self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion,
                512,
                kernel_size=1,
                stride=1,
                dilation=1,
                bias=False,
                D=D,
            ),
            ME.MinkowskiBatchNorm(512),
            ME.MinkowskiReLU(),
            conv(
                512,
                out_channels,
                kernel_size=1,
                stride=1,
                dilation=1,
                bias=True,
                D=D,
            ),
        )

    def forward(self, x):
        """Encode, decode with skip concatenations, and apply the head."""
        # Encoder; the output of each block is kept as a skip connection.
        stem = self.relu(self.bn1(self.conv1p1s1(x)))
        skip_p1 = self.block1(stem)

        down = self.relu(self.bn2(self.conv2p1s2(skip_p1)))
        skip_p2 = self.block2(down)

        down = self.relu(self.bn3(self.conv3p2s2(skip_p2)))
        skip_p4 = self.block3(down)

        down = self.relu(self.bn4(self.conv4p4s2(skip_p4)))

        # pixel_dist=8
        feats = self.block4(down)

        # Decoder: upsample, concatenate the matching skip, refine.
        feats = self.relu(self.bntr4(self.convtr4p8s2(feats)))
        feats = self.block5(me.cat(feats, skip_p4))

        feats = self.relu(self.bntr5(self.convtr5p4s2(feats)))
        feats = self.block6(me.cat(feats, skip_p2))

        feats = self.relu(self.bntr6(self.convtr6p2s2(feats)))

        return self.final(me.cat(feats, skip_p1))


class ResUNet14(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1, 1, 1)


class ResUNet18(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2, 2, 2)


class ResUNet18INBN(ResUNet18):
    NORM_TYPE = NormType.INSTANCE_BATCH_NORM
    BLOCK = BasicBlockINBN


class ResUNet34(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (3, 4, 6, 3, 2, 2)


class ResUNet50(MinkUNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 6, 3, 2, 2)


class ResUNet101(MinkUNetBase):
    BLOCK = Bottleneck
    LAYERS = (3, 4, 23, 3, 2, 2)


class ResUNet14D(ResUNet14):
    PLANES = (64, 128, 256, 512, 512, 512, 512)


class ResUNet18D(ResUNet18):
    PLANES = (64, 128, 256, 512, 512, 512, 512)


class ResUNet34D(ResUNet34):
    PLANES = (64, 128, 256, 512, 512, 512, 512)


class ResUNet34E(ResUNet34):
    INIT_DIM = 32
    PLANES = (32, 64, 128, 256, 128, 64, 64)


class ResUNet34F(ResUNet34):
    INIT_DIM = 32
    PLANES = (32, 64, 128, 256, 128, 64, 32)
class MinkUNetHyper(MinkUNetBase):
    """MinkUNetBase variant whose head also sees upsampled intermediate decoder features.

    The outputs of ``block5`` and ``block6`` are brought to the final
    resolution with ``MinkowskiPoolingTranspose`` and concatenated with the
    last decoder output and ``block1``'s output before the 1x1 head.

    Legacy note carried over from the original source: callers are told to
    call ``initialize_coords`` before the forward pass and ``clear`` once the
    data is processed. Neither method is defined in this class - TODO confirm
    whether the note still applies.
    """

    BLOCK = None
    PLANES = (64, 128, 256, 512, 256, 128, 128)
    DILATIONS = (1, 1, 1, 1, 1, 1)
    LAYERS = (2, 2, 2, 2, 2, 2)
    INIT_DIM = 64
    OUT_PIXEL_DIST = 1
    NORM_TYPE = NormType.BATCH_NORM
    NON_BLOCK_CONV_TYPE = ConvType.SPATIAL_HYPERCUBE
    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=3, **kwargs):
        # Starts the MRO lookup after MinkUNetBase, i.e. MinkUNetBase.__init__
        # is bypassed. **kwargs is accepted but not forwarded.
        super(MinkUNetBase, self).__init__(
            in_channels, out_channels, config, D
        )

    def network_initialization(self, in_channels, out_channels, config, D):
        """Create encoder, decoder, pooling-transpose and head modules.

        Reads ``config.bn_momentum`` and ``config.conv1_kernel_size``.
        Dilations come from the class constant ``DILATIONS``.
        """
        rates = self.DILATIONS
        momentum = config.bn_momentum

        def space_n_time_m(n, m):
            # D == 3 keeps the scalar; any other D expands to [n, n, n, m].
            if D == 3:
                return n
            return [n, n, n, m]

        def norm(channels):
            return get_norm(self.NORM_TYPE, channels, D, bn_momentum=momentum)

        def residual_stage(idx):
            # _make_layer also advances self.inplanes to the stage's width.
            return self._make_layer(
                self.BLOCK,
                self.PLANES[idx],
                self.LAYERS[idx],
                dilation=rates[idx],
                norm_type=self.NORM_TYPE,
                bn_momentum=momentum,
            )

        def strided_conv():
            return conv(
                self.inplanes,
                self.inplanes,
                kernel_size=space_n_time_m(2, 1),
                stride=space_n_time_m(2, 1),
                dilation=1,
                conv_type=self.NON_BLOCK_CONV_TYPE,
                D=D,
            )

        def transposed_conv(out_planes):
            return conv_tr(
                self.inplanes,
                out_planes,
                kernel_size=space_n_time_m(2, 1),
                upsample_stride=space_n_time_m(2, 1),
                dilation=1,
                bias=False,
                conv_type=self.NON_BLOCK_CONV_TYPE,
                D=D,
            )

        def unpool(factor):
            return ME.MinkowskiPoolingTranspose(
                kernel_size=factor, stride=factor, dimension=D
            )

        if D == 4:
            self.OUT_PIXEL_DIST = space_n_time_m(self.OUT_PIXEL_DIST, 1)

        # ---- encoder -------------------------------------------------------
        self.inplanes = self.INIT_DIM
        self.conv1p1s1 = conv(
            in_channels,
            self.inplanes,
            kernel_size=space_n_time_m(config.conv1_kernel_size, 1),
            stride=1,
            dilation=1,
            conv_type=self.NON_BLOCK_CONV_TYPE,
            D=D,
        )
        self.bn1 = norm(self.PLANES[0])
        self.block1 = residual_stage(0)

        self.conv2p1s2 = strided_conv()
        self.bn2 = norm(self.inplanes)
        self.block2 = residual_stage(1)

        self.conv3p2s2 = strided_conv()
        self.bn3 = norm(self.inplanes)
        self.block3 = residual_stage(2)

        self.conv4p4s2 = strided_conv()
        self.bn4 = norm(self.inplanes)
        self.block4 = residual_stage(3)

        # ---- decoder -------------------------------------------------------
        # pool_tr4 is registered but `forward` never calls it.
        self.pool_tr4 = unpool(8)
        self.convtr4p8s2 = transposed_conv(self.PLANES[4])
        self.bntr4 = norm(self.PLANES[4])

        self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion
        self.block5 = residual_stage(4)
        self.pool_tr5 = unpool(4)
        # Channel count of block5's output, which pool_tr5 forwards to the head.
        block5_planes = self.inplanes
        self.convtr5p4s2 = transposed_conv(self.PLANES[5])
        self.bntr5 = norm(self.PLANES[5])

        self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion
        self.block6 = residual_stage(5)
        self.pool_tr6 = unpool(2)
        # Channel count of block6's output, which pool_tr6 forwards to the head.
        block6_planes = self.inplanes
        self.convtr6p2s2 = transposed_conv(self.PLANES[6])
        self.bntr6 = norm(self.PLANES[6])

        self.relu = MinkowskiReLU(inplace=True)

        # ---- head ------------------------------------------------------------
        # Input width = both pooled decoder outputs + last decoder output +
        # block1's output, matching the concatenation in `forward`.
        self.final = nn.Sequential(
            conv(
                block5_planes
                + block6_planes
                + self.PLANES[6]
                + self.PLANES[0] * self.BLOCK.expansion,
                512,
                kernel_size=1,
                bias=False,
                D=D,
            ),
            ME.MinkowskiBatchNorm(512),
            ME.MinkowskiReLU(),
            conv(512, out_channels, kernel_size=1, bias=True, D=D),
        )

    def forward(self, x):
        """Encode, decode, and feed the head with multi-scale decoder features."""
        # Encoder; the output of each block is kept as a skip connection.
        stem = self.relu(self.bn1(self.conv1p1s1(x)))
        skip_p1 = self.block1(stem)

        down = self.relu(self.bn2(self.conv2p1s2(skip_p1)))
        skip_p2 = self.block2(down)

        down = self.relu(self.bn3(self.conv3p2s2(skip_p2)))
        skip_p4 = self.block3(down)

        down = self.relu(self.bn4(self.conv4p4s2(skip_p4)))

        # pixel_dist=8
        feats = self.block4(down)

        feats = self.relu(self.bntr4(self.convtr4p8s2(feats)))
        feats = self.block5(me.cat(feats, skip_p4))
        hyper_p4 = self.pool_tr5(feats)

        feats = self.relu(self.bntr5(self.convtr5p4s2(feats)))
        feats = self.block6(me.cat(feats, skip_p2))
        hyper_p2 = self.pool_tr6(feats)

        feats = self.relu(self.bntr6(self.convtr6p2s2(feats)))

        return self.final(me.cat(feats, skip_p1, hyper_p2, hyper_p4))
# MinkUNetHyper with instance/batch-norm blocks. LAYERS is not overridden, so
# the depths are MinkUNetHyper's (2, 2, 2, 2, 2, 2) despite the "14" in the
# name - NOTE(review): confirm whether LAYERS = (1, 1, 1, 1, 1, 1) was intended.
class MinkUNetHyper14INBN(MinkUNetHyper):
    NORM_TYPE = NormType.INSTANCE_BATCH_NORM
    BLOCK = BasicBlockINBN


class STMinkUNetBase(MinkUNetBase):
    # MinkUNetBase with a default of D=4. CONV_TYPE is restated with the same
    # value MinkUNetBase already declares.

    CONV_TYPE = ConvType.SPATIAL_HYPERCUBE_TEMPORAL_HYPERCROSS

    def __init__(self, in_channels, out_channels, config, D=4, **kwargs):
        # kwargs reach MinkUNetBase.__init__, which does not forward them.
        super().__init__(in_channels, out_channels, config, D, **kwargs)


# STMinkUNetBase comes first in the bases so its __init__ (D=4 default) and
# CONV_TYPE win in the MRO; BLOCK/LAYERS come from the ResUNet* class.
class STResUNet14(STMinkUNetBase, ResUNet14):
    pass


class STResUNet18(STMinkUNetBase, ResUNet18):
    pass


class STResUNet34(STMinkUNetBase, ResUNet34):
    pass


class STResUNet50(STMinkUNetBase, ResUNet50):
    pass


class STResUNet101(STMinkUNetBase, ResUNet101):
    pass


class STResTesseractUNetBase(STMinkUNetBase):
    # Same as STMinkUNetBase but with CONV_TYPE set to HYPERCUBE.
    CONV_TYPE = ConvType.HYPERCUBE


class STResTesseractUNet14(STResTesseractUNetBase, ResUNet14):
    pass


class STResTesseractUNet18(STResTesseractUNetBase, ResUNet18):
    pass


class STResTesseractUNet34(STResTesseractUNetBase, ResUNet34):
    pass


class STResTesseractUNet50(STResTesseractUNetBase, ResUNet50):
    pass


class STResTesseractUNet101(STResTesseractUNetBase, ResUNet101):
    pass
class Wrapper(Module):
    """Pair a segmentation network with a post-processing filter.

    Subclasses implement ``initialize_filter`` and are expected to set
    ``self.model`` and ``self.filter`` there; ``forward`` uses both.
    """

    OUT_PIXEL_DIST = -1

    def __init__(self, NetClass, in_nchannel, out_nchannel, config):
        super().__init__()
        self.initialize_filter(NetClass, in_nchannel, out_nchannel, config)

    def initialize_filter(self, NetClass, in_nchannel, out_nchannel, config):
        """Build ``self.model`` and ``self.filter``; the base class refuses."""
        raise NotImplementedError("Must initialize a model and a filter")

    def forward(self, x, coords, colors=None):
        """Run the model, then (usually) the filter on its output.

        In eval mode the filter is always applied. In training mode it is
        applied with probability 0.5, so the network also learns to work
        without it.
        """
        soutput = self.model(x)

        # Random draw happens only in training mode, as in the original test.
        skip_filter = self.training and random.random() >= 0.5
        if skip_filter:
            return soutput

        # The filter needs the coordinates produced by the finished forward
        # pass of the model.
        filter_coords = self.filter.initialize_coords(self.model, coords, colors)
        return self.filter(SparseTensor(soutput.F, filter_coords))


# ---- predict.py -------------------------------------------------------------

# Directory whose sub-directories (one per scene) are scanned by `main`.
root_dir = '/home/weders/scratch/scratch/scannetter/arkit/raw/Validation'


class InstanceSegmentation(torch.nn.Module):
    """Thin ``nn.Module`` shell around the network described by ``cfg.model``."""

    def __init__(self, cfg):
        super().__init__()
        # The model is built from the Hydra config node.
        self.model = hydra.utils.instantiate(cfg.model)

    def forward(self, x, raw_coordinates=None):
        """Delegate to the wrapped model, passing ``raw_coordinates`` by keyword."""
        return self.model(x, raw_coordinates=raw_coordinates)
def _select_instances(
    logits,
    masks,
    inverse_map,
    num_classes=200,
    min_confidence=0.5,
    mask_threshold=0.5,
):
    """Turn per-query network outputs into a confidence-sorted instance list.

    Args:
        logits: per-query class logits, indexed ``logits[query]``.
        masks: per-voxel mask logits, indexed ``masks[:, query]``.
        inverse_map: index that maps voxel entries back to the original
            mesh vertices (as returned by ``sparse_quantize``).
        num_classes: predictions whose arg-max class is ``>= num_classes``
            are dropped.
        min_confidence: minimum combined score an instance must exceed.
        mask_threshold: sigmoid threshold that binarises a mask.

    Returns:
        A list of ``(confidence, label, vertex_mask)`` tuples in ascending
        confidence order (ties broken by label, then by query order). The
        list is empty when nothing passes the filters.
    """
    instances = []
    for query_idx, query_logits in enumerate(logits):
        class_probs = torch.softmax(query_logits, dim=-1)
        mask_probs = torch.sigmoid(masks[:, query_idx])

        label = torch.argmax(class_probs, dim=-1)
        voxel_mask = mask_probs > mask_threshold
        # Mean mask probability over the selected voxels; the epsilon keeps
        # an empty mask from dividing by zero.
        mask_score = mask_probs[voxel_mask].sum() / (voxel_mask.sum() + 1e-8)
        confidence = torch.max(class_probs) * mask_score

        if label < num_classes and confidence > min_confidence:
            # Map the mask back to the original point cloud.
            instances.append(
                (confidence.item(), label.item(), voxel_mask[inverse_map])
            )

    # Sort on the scalar fields only: letting tuple comparison fall through
    # to the mask tensors raises as soon as two instances tie.
    instances.sort(key=lambda instance: instance[:2])
    return instances


def _offset_class_index(label):
    """Return the index into the ScanNet200 tables used for colouring.

    Label offset is 2 for ScanNet200 and label 0 is mapped to -1 before the
    offset is added (see trainer.py in Mask3D).
    """
    return (-1 if label == 0 else label) + 2


@hydra.main(
    config_path="conf", config_name="config_base_instance_segmentation.yaml"
)
def main(cfg: DictConfig):
    """Run Mask3D on the ``mesh_tsdf.ply`` of a scene under ``root_dir``.

    For the processed scene this writes, into ``<scene>/pred_mask3d_ours``:
      * ``mesh_tsdf_labelled.ply`` - the mesh coloured by predicted class,
      * ``pred_mask/NNN.txt`` - one binary per-vertex mask per instance,
      * ``mask3d_predictions.txt`` - ``<mask file> <class id> <confidence>``
        per instance, in ascending confidence order.

    A scene without any confident instance now yields an uncoloured mesh and
    an empty predictions file instead of crashing.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.chdir(hydra.utils.get_original_cwd())
    model = InstanceSegmentation(cfg)

    if cfg.general.backbone_checkpoint is not None:
        cfg, model = load_backbone_checkpoint_with_missing_or_exsessive_keys(
            cfg, model
        )
    if cfg.general.checkpoint is not None:
        cfg, model = load_checkpoint_with_missing_or_exsessive_keys(cfg, model)

    model = model.to(device)
    # NOTE(review): eval mode is deliberately left disabled, as in the
    # original script, so normalisation/dropout layers run in training mode
    # during inference. Confirm this is intended before enabling.
    # model.eval()

    color_mean = (0.47793125906962, 0.4303257521323044, 0.3749598901421883)
    color_std = (0.2834475483823543, 0.27566157565723015, 0.27018971370874995)
    normalize_color = A.Normalize(mean=color_mean, std=color_std)

    # NOTE(review): processing is restricted to this single scene id, as in
    # the original script; it looks like a debugging leftover.
    only_scene = '42445991'

    for sc in os.listdir(root_dir):
        # Filter first so no output directory is created for skipped scenes.
        if sc != only_scene:
            continue

        mesh_file = os.path.join(root_dir, sc, 'mesh_tsdf.ply')
        if not os.path.exists(mesh_file):
            continue

        output_dir = os.path.join(root_dir, sc, 'pred_mask3d_ours')
        os.makedirs(output_dir, exist_ok=True)

        print('Processing', sc)

        mesh = o3d.io.read_triangle_mesh(mesh_file)
        mesh.compute_vertex_normals()

        points = np.asarray(mesh.vertices)
        colors = np.asarray(mesh.vertex_colors)

        # The normaliser works on images: scale to 0-255 and present the
        # vertex colours as a one-row image.
        colors = colors * 255.
        pseudo_image = colors.astype(np.uint8)[np.newaxis, :, :]
        colors = np.squeeze(normalize_color(image=pseudo_image)["image"])

        # Voxelise with a cell size of 0.02 (in mesh units).
        coords = np.floor(points / 0.02)
        _, _, unique_map, inverse_map = ME.utils.sparse_quantize(
            coordinates=coords,
            features=colors,
            return_index=True,
            return_inverse=True,
        )

        coordinates = [torch.from_numpy(coords[unique_map]).int()]
        features = [torch.from_numpy(colors[unique_map]).float()]

        coordinates, _ = ME.utils.sparse_collate(
            coords=coordinates, feats=features
        )
        features = torch.cat(features, dim=0)
        data = ME.SparseTensor(
            coordinates=coordinates,
            features=features,
            device=device,
        )

        # NOTE(review): the colour features are what is passed as
        # `raw_coordinates`, unchanged from the original - confirm intent.
        with torch.no_grad():
            outputs = model(data, raw_coordinates=features)

        del data
        torch.cuda.empty_cache()

        # First (only) batch element, moved to the CPU.
        logits = outputs["pred_logits"][0].detach().cpu()
        masks = outputs["pred_masks"][0].detach().cpu()

        instances = _select_instances(logits, masks, inverse_map)

        # Labelled mesh: instances are painted in ascending confidence, so
        # the most confident one wins where masks overlap.
        mesh_labelled = o3d.geometry.TriangleMesh()
        mesh_labelled.vertices = mesh.vertices
        mesh_labelled.triangles = mesh.triangles

        colors_mapped = np.zeros((len(mesh.vertices), 3))
        for _, label, vertex_mask in instances:
            class_id = VALID_CLASS_IDS_200[_offset_class_index(label)]
            colors_mapped[vertex_mask.numpy()] = SCANNET_COLOR_MAP_200[class_id]

        mesh_labelled.vertex_colors = o3d.utility.Vector3dVector(
            colors_mapped.astype(np.float32) / 255.
        )
        o3d.io.write_triangle_mesh(
            f'{output_dir}/mesh_tsdf_labelled.ply', mesh_labelled
        )

        os.makedirs(os.path.join(output_dir, 'pred_mask'), exist_ok=True)

        # NOTE(review): the class id written below indexes
        # VALID_CLASS_IDS_200 with the raw label, whereas the mesh colouring
        # above applies the label offset. Kept as in the original; confirm
        # which mapping consumers of this file expect.
        predictions_file = os.path.join(output_dir, 'mask3d_predictions.txt')
        with open(predictions_file, 'w') as f:
            for i, (confidence, label, vertex_mask) in enumerate(instances):
                mask_file = f'pred_mask/{str(i).zfill(3)}.txt'
                f.write(
                    f'{mask_file} {VALID_CLASS_IDS_200[label]} {confidence}\n'
                )
                np.savetxt(
                    os.path.join(output_dir, mask_file),
                    vertex_mask.numpy(),
                    fmt='%d',
                )


if __name__ == "__main__":
    main()
+model.num_queries=${CURR_QUERY} \ +general.topk_per_image=${CURR_TOPK} \ +general.use_dbscan=true \ +general.dbscan_eps=${CURR_DBSCAN} \ +general.export=true \ +data.test_mode=test \ +general.export_threshold=${CURR_T} \ No newline at end of file diff --git a/models/Mask3D/mask3d/scripts/s3dis/s3dis_from_scratch.sh b/models/Mask3D/mask3d/scripts/s3dis/s3dis_from_scratch.sh new file mode 100644 index 0000000000000000000000000000000000000000..373e067d050bd30a904fa955d3ea26f9414c0f2a --- /dev/null +++ b/models/Mask3D/mask3d/scripts/s3dis/s3dis_from_scratch.sh @@ -0,0 +1,33 @@ +#!/bin/bash +export OMP_NUM_THREADS=3 # speeds up MinkowskiEngine + +CURR_AREA=1 # set the area number accordingly [1,6] +CURR_DBSCAN=0.6 +CURR_TOPK=-1 +CURR_QUERY=100 + +python main_instance_segmentation.py \ + general.project_name="s3dis" \ + general.experiment_name="area${CURR_AREA}_from_scratch" \ + data.batch_size=4 \ + data/datasets=s3dis \ + general.num_targets=14 \ + data.num_labels=13 \ + trainer.max_epochs=1001 \ + general.area=${CURR_AREA} \ + trainer.check_val_every_n_epoch=10 + +python main_instance_segmentation.py \ +general.project_name="s3dis_eval" \ +general.experiment_name="area${CURR_AREA}_from_scratch_eps_${CURR_DBSCAN}_topk_${CURR_TOPK}_q_${CURR_QUERY}" \ +general.checkpoint="checkpoints/s3dis/from_scratch/area${CURR_AREA}.ckpt" \ +general.train_mode=false \ +data.batch_size=4 \ +data/datasets=s3dis \ +general.num_targets=14 \ +data.num_labels=13 \ +general.area=${CURR_AREA} \ +model.num_queries=${CURR_QUERY} \ +general.topk_per_image=${CURR_TOPK} \ +general.use_dbscan=true \ +general.dbscan_eps=${CURR_DBSCAN} diff --git a/models/Mask3D/mask3d/scripts/s3dis/s3dis_pretrained.sh b/models/Mask3D/mask3d/scripts/s3dis/s3dis_pretrained.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5a1d08d8a4a17f9d6aa2f88c5043d23bd9b1fed --- /dev/null +++ b/models/Mask3D/mask3d/scripts/s3dis/s3dis_pretrained.sh @@ -0,0 +1,34 @@ +#!/bin/bash +export OMP_NUM_THREADS=3 # 
#!/bin/bash
# ScanNet benchmark: first a training run on the train_validation split, then
# a test-mode run with export enabled that loads a fixed checkpoint path.
export OMP_NUM_THREADS=3  # speeds up MinkowskiEngine

# Values substituted into the TEST overrides below.
CURR_QUERY=150
CURR_TOPK=300
CURR_DBSCAN=0.95

# TRAIN
TRAIN_OVERRIDES=(
    general.experiment_name="benchmark"
    general.eval_on_segments=true
    general.train_on_segments=true
    data.train_mode=train_validation
)
python main_instance_segmentation.py "${TRAIN_OVERRIDES[@]}"

# TEST
TEST_OVERRIDES=(
    general.experiment_name="benchmark_query_${CURR_QUERY}_topk_${CURR_TOPK}_dbscan_${CURR_DBSCAN}"
    general.project_name="scannet_eval"
    general.checkpoint='checkpoints/scannet/scannet_benchmark.ckpt'
    general.eval_on_segments=true
    general.train_on_segments=true
    general.train_mode=false
    general.export=true
    data.test_mode=test
    model.num_queries="${CURR_QUERY}"
    general.topk_per_image="${CURR_TOPK}"
    general.use_dbscan=true
    general.dbscan_eps="${CURR_DBSCAN}"
)
python main_instance_segmentation.py "${TEST_OVERRIDES[@]}"
#!/bin/bash
# ScanNet200 benchmark: a training run on the train_validation split followed
# by a test-mode run that exports predictions above CURR_T.
export OMP_NUM_THREADS=3  # speeds up MinkowskiEngine

CURR_DBSCAN=0.95
CURR_TOPK=300
CURR_QUERY=150
CURR_T=0.001

# Overrides that both runs pass unchanged, in this order.
SCANNET200_OVERRIDES=(
    data/datasets=scannet200
    general.num_targets=201
    data.num_labels=200
    general.eval_on_segments=true
    general.train_on_segments=true
)

# TRAIN
python main_instance_segmentation.py \
    general.experiment_name="scannet200_benchmark" \
    general.project_name="scannet200" \
    "${SCANNET200_OVERRIDES[@]}" \
    data.train_mode=train_validation

# TEST
python main_instance_segmentation.py \
    general.experiment_name="scannet200_benchmark_query_${CURR_QUERY}_topk_${CURR_TOPK}_dbscan_${CURR_DBSCAN}_export_${CURR_T}" \
    general.project_name="scannet200_eval" \
    general.checkpoint="checkpoints/scannet200/scannet200_benchmark.ckpt" \
    "${SCANNET200_OVERRIDES[@]}" \
    general.train_mode=false \
    model.num_queries="${CURR_QUERY}" \
    general.topk_per_image="${CURR_TOPK}" \
    general.use_dbscan=true \
    general.dbscan_eps="${CURR_DBSCAN}" \
    general.export=true \
    data.test_mode=test \
    general.export_threshold="${CURR_T}"
"""Merge the exported predictions of the benchmark_02 and benchmark_03 runs.

Both ``pred_mask`` directories are copied into the target directory (as
``pred_mask_02`` / ``pred_mask_03``). For every scene file of the 03 export,
instances whose class id is in ``CLASS_IDS_FROM_03`` are taken from the 03
export and all other class ids from the 02 export; the mask files of the
rejected instances are deleted from the target directory and a merged
per-scene prediction file is written there.
"""
import os
import shutil
from glob import glob
from tqdm import tqdm

base_path = "INSERT_WORKING_DIRECTORY"
vs03 = f"{base_path}/benchmark_03"
vs02 = f"{base_path}/benchmark_02"

target_path = "INSERT_TARGET_DIRECTORY"

# Class ids kept from the benchmark_03 export; every other id is kept from
# the benchmark_02 export.
CLASS_IDS_FROM_03 = frozenset({1, 3, 4, 7, 8, 11, 12, 13})


def _collect_instances(prediction_file, merged_mask_dir, keep):
    """Return the prediction lines of ``prediction_file`` accepted by ``keep``.

    Each non-blank line is ``<mask path> <class id> <score>``. The mask path is
    rewritten from ``pred_mask`` to ``merged_mask_dir``. Mask files of rejected
    instances are removed from ``target_path``.

    Args:
        prediction_file: path of a per-scene prediction text file.
        merged_mask_dir: directory name that replaces ``pred_mask``.
        keep: predicate called with the integer class id.

    Raises:
        FileNotFoundError: if a mask file to delete does not exist.
        ValueError: if a line does not consist of three space-separated fields.
    """
    kept = []
    with open(prediction_file, "r") as fin:
        for raw_line in fin:
            line = raw_line.rstrip()
            if not line:
                # Skip blank lines instead of silently ending the parse there.
                continue
            mask_path, class_id, score = line.split(" ")
            merged_mask_path = mask_path.replace("pred_mask", merged_mask_dir)

            if keep(int(class_id)):
                kept.append(f"{merged_mask_path} {class_id} {score}")
                print(kept[-1])
            else:
                print(f"DELETE {target_path}/{merged_mask_path}")
                os.remove(f"{target_path}/{merged_mask_path}")
    return kept


print("COPY MASKS FILES 1/2 ...")
shutil.copytree(f"{vs02}/pred_mask", f"{target_path}/pred_mask_02")
print("COPY MASKS FILES 2/2 ...")
shutil.copytree(f"{vs03}/pred_mask", f"{target_path}/pred_mask_03")

for scene03 in tqdm(glob(f"{vs03}/*.txt")):
    scene_file = os.path.basename(scene03)

    instances = _collect_instances(
        scene03, "pred_mask_03", lambda cid: cid in CLASS_IDS_FROM_03
    )
    instances += _collect_instances(
        f"{vs02}/{scene_file}",
        "pred_mask_02",
        lambda cid: cid not in CLASS_IDS_FROM_03,
    )

    with open(f"{target_path}/{scene_file}", "w") as fout:
        for line in instances:
            fout.write(f"{line}\n")
+general.num_targets=15 \ +data.num_labels=15 \ +data.voxel_size=0.2 \ +data.num_workers=10 \ +data.cache_data=true \ +data.cropping_v1=false \ +general.reps_per_epoch=100 \ +model.num_queries=${CURR_QUERY} \ +general.on_crops=true \ +data.crop_length=${CURR_SIZE} \ +general.eval_inner_core=50.0 \ +data.train_mode=train_validation + +# TEST network 1 +python main_instance_segmentation.py \ +general.experiment_name="benchmark_03_query_${CURR_QUERY}_topk_${CURR_TOPK}_dbscan_${CURR_DBSCAN}_size_${CURR_SIZE}_T_${CURR_THRESHOLD}" \ +general.project_name="stpls3d_eval" \ +data/datasets=stpls3d \ +general.num_targets=15 \ +data.num_labels=15 \ +data.voxel_size=0.333 \ +data.num_workers=10 \ +data.cache_data=true \ +data.cropping_v1=false \ +general.reps_per_epoch=100 \ +model.num_queries=${CURR_QUERY} \ +general.on_crops=true \ +model.config.backbone._target_=models.Res16UNet18B \ +general.train_mode=false \ +general.checkpoint="checkpoints/stpls3d/stpls3d_benchmark_03.ckpt" \ +data.crop_length=${CURR_SIZE} \ +general.eval_inner_core=50.0 \ +general.topk_per_image=${CURR_TOPK} \ +general.use_dbscan=true \ +general.dbscan_eps=${CURR_DBSCAN} \ +data.test_mode=test \ +general.export=true + +# TEST network 2 +python main_instance_segmentation.py \ +general.experiment_name="benchmark_02_query_${CURR_QUERY}_topk_${CURR_TOPK}_dbscan_${CURR_DBSCAN}_size_${CURR_SIZE}_T_${CURR_THRESHOLD}" \ +general.project_name="stpls3d_eval" \ +data/datasets=stpls3d \ +general.num_targets=15 \ +data.num_labels=15 \ +data.voxel_size=0.2 \ +data.num_workers=10 \ +data.cache_data=true \ +data.cropping_v1=false \ +general.reps_per_epoch=100 \ +model.num_queries=${CURR_QUERY} \ +general.on_crops=true \ +general.train_mode=false \ +general.checkpoint="checkpoints/stpls3d/stpls3d_benchmark_02.ckpt" \ +data.crop_length=${CURR_SIZE} \ +general.eval_inner_core=50.0 \ +general.topk_per_image=${CURR_TOPK} \ +general.use_dbscan=true \ +general.dbscan_eps=${CURR_DBSCAN} \ +data.test_mode=test \ 
+general.export=true + +# COMBINE OUTPUTS OF ENSEMBLE +# VOXEL SIZE 0.2 FOR OBJECTS OF SMALL CLASSES; VOXEL SIZE 0.333 FOR OBJECTS OF LARGE CLASS CATEGORIES +# TODO FILL IN PATHS +python merge_exports.py diff --git a/models/Mask3D/mask3d/scripts/stpls3d/stpls3d_val.sh b/models/Mask3D/mask3d/scripts/stpls3d/stpls3d_val.sh new file mode 100644 index 0000000000000000000000000000000000000000..4d5cdce1e34537c2d1d3940edb37f7693d55aba1 --- /dev/null +++ b/models/Mask3D/mask3d/scripts/stpls3d/stpls3d_val.sh @@ -0,0 +1,48 @@ +#!/bin/bash +export OMP_NUM_THREADS=3 + +CURR_DBSCAN=14.0 +CURR_TOPK=750 +CURR_QUERY=160 +CURR_SIZE=54 + +# TRAIN +python main_instance_segmentation.py \ +general.experiment_name="validation" \ +general.project_name="stpls3d" \ +data/datasets=stpls3d \ +general.num_targets=15 \ +data.num_labels=15 \ +data.voxel_size=0.333 \ +data.num_workers=10 \ +data.cache_data=true \ +data.cropping_v1=false \ +general.reps_per_epoch=100 \ +model.num_queries=${CURR_QUERY} \ +general.on_crops=true \ +model.config.backbone._target_=models.Res16UNet18B \ +data.crop_length=${CURR_SIZE} \ +general.eval_inner_core=50.0 + +# TEST +python main_instance_segmentation.py \ +general.experiment_name="validation_query_${CURR_QUERY}_topk_${CURR_TOPK}_dbscan_${CURR_DBSCAN}_size_${CURR_SIZE}" \ +general.project_name="stpls3d_eval" \ +data/datasets=stpls3d \ +general.num_targets=15 \ +data.num_labels=15 \ +data.voxel_size=0.333 \ +data.num_workers=10 \ +data.cache_data=true \ +data.cropping_v1=false \ +general.reps_per_epoch=100 \ +model.num_queries=${CURR_QUERY} \ +general.on_crops=true \ +model.config.backbone._target_=models.Res16UNet18B \ +general.train_mode=false \ +general.checkpoint="checkpoints/stpls3d/stpls3d_val.ckpt" \ +data.crop_length=${CURR_SIZE} \ +general.eval_inner_core=50.0 \ +general.topk_per_image=${CURR_TOPK} \ +general.use_dbscan=true \ +general.dbscan_eps=${CURR_DBSCAN} diff --git a/models/Mask3D/mask3d/trainer/__init__.py 
@functools.lru_cache(20)
def get_evenly_distributed_colors(
    count: int,
) -> List[Tuple[np.uint8, np.uint8, np.uint8]]:
    """Return ``count`` fully saturated colors with evenly spaced hues.

    The hues ``0/count .. (count-1)/count`` are shuffled with the global
    ``random`` state and converted to RGB scaled to 0..255. Each entry is a
    length-3 ``uint8`` array. Results are memoised per ``count`` (20 entries),
    so repeated calls with the same ``count`` return the same list object.
    """
    hsv_triples = [(step / count, 1.0, 1.0) for step in range(count)]
    random.shuffle(hsv_triples)

    palette = []
    for hsv in hsv_triples:
        rgb = np.array(colorsys.hsv_to_rgb(*hsv)) * 255
        palette.append(rgb.astype(np.uint8))
    return palette
def training_step(self, batch, batch_idx):
    """Run one optimisation step and return the summed weighted loss.

    Args:
        batch: ``(data, target, file_names)`` triple.
        batch_idx: index of the batch (not used here).

    Returns:
        The sum of all weighted loss terms, or ``None`` when the batch has no
        targets or the forward pass hits the known single-point
        cross-attention error.

    Raises:
        RuntimeError: if the batch has more feature rows than
            ``config.general.max_batch_size``, or for any other forward error.
        ValueError: re-raised from the criterion after printing debug info.
    """
    data, target, file_names = batch

    num_feature_rows = data.features.shape[0]
    if num_feature_rows > self.config.general.max_batch_size:
        print("data exceeds threshold")
        raise RuntimeError("BATCH TOO BIG")

    if len(target) == 0:
        print("no targets")
        return None

    if self.config.data.add_raw_coordinates:
        # the last three feature columns are split off as raw coordinates
        raw_coordinates = data.features[:, -3:]
        data.features = data.features[:, :-3]
    else:
        raw_coordinates = None

    data = ME.SparseTensor(
        coordinates=data.coordinates,
        features=data.features,
        device=self.device,
    )

    segments_per_sample = [sample["point2segment"] for sample in target]
    try:
        output = self.forward(
            data,
            point2segment=segments_per_sample,
            raw_coordinates=raw_coordinates,
        )
    except RuntimeError as run_err:
        print(run_err)
        known_failure = "only a single point gives nans in cross-attention"
        if known_failure == run_err.args[0]:
            return None
        raise run_err

    try:
        losses = self.criterion(output, target, mask_type=self.mask_type)
    except ValueError as val_err:
        print(f"ValueError: {val_err}")
        print(f"data shape: {data.shape}")
        print(f"data feat shape: {data.features.shape}")
        print(f"data feat nans: {data.features.isnan().sum()}")
        print(f"output: {output}")
        print(f"target: {target}")
        print(f"filenames: {file_names}")
        raise val_err

    for name in list(losses):
        if name not in self.criterion.weight_dict:
            # remove this loss if not specified in `weight_dict`
            losses.pop(name)
            continue
        losses[name] *= self.criterion.weight_dict[name]

    logs = {}
    for name, value in losses.items():
        logs[f"train_{name}"] = value.detach().cpu().item()

    # one aggregate per loss family, averaged over every logged entry of it
    for family in ("loss_ce", "loss_mask", "loss_dice"):
        logs[f"train_mean_{family}"] = statistics.mean(
            [value for name, value in logs.items() if family in name]
        )

    self.log_dict(logs)
    return sum(losses.values())
def training_epoch_end(self, outputs):
    """Log the mean training loss of the finished epoch as ``train_loss_mean``.

    Args:
        outputs: per-step results; each item is indexed with ``"loss"`` and
            the value must support ``.cpu().item()``.

    Nothing is logged when ``outputs`` is empty; averaging an empty list
    would otherwise raise ``ZeroDivisionError``.
    """
    if not outputs:
        return

    step_losses = [out["loss"].cpu().item() for out in outputs]
    self.log_dict({"train_loss_mean": sum(step_losses) / len(step_losses)})
+ gt_boxes.append( + { + "position": mask_coords_middle, + "size": size, + "color": self.validation_dataset.map2color([label])[0], + } + ) + + gt_pcd_color.append( + self.validation_dataset.map2color([label]).repeat( + gt_pcd_pos[-1].shape[0], 1 + ) + ) + gt_inst_pcd_color.append( + instances_colors[instance_counter % len(instances_colors)] + .unsqueeze(0) + .repeat(gt_pcd_pos[-1].shape[0], 1) + ) + + gt_pcd_normals.append( + original_normals[mask_tmp.astype(bool), :] + ) + + gt_pcd_pos = np.concatenate(gt_pcd_pos) + gt_pcd_normals = np.concatenate(gt_pcd_normals) + gt_pcd_color = np.concatenate(gt_pcd_color) + gt_inst_pcd_color = np.concatenate(gt_inst_pcd_color) + + v = vis.Visualizer() + + v.add_points( + "RGB Input", + full_res_coords, + colors=original_colors, + normals=original_normals, + visible=True, + point_size=point_size, + ) + + if backbone_features is not None: + v.add_points( + "PCA", + full_res_coords, + colors=backbone_features, + normals=original_normals, + visible=False, + point_size=point_size, + ) + + if "labels" in target_full: + v.add_points( + "Semantics (GT)", + gt_pcd_pos, + colors=gt_pcd_color, + normals=gt_pcd_normals, + alpha=0.8, + visible=False, + point_size=point_size, + ) + v.add_points( + "Instances (GT)", + gt_pcd_pos, + colors=gt_inst_pcd_color, + normals=gt_pcd_normals, + alpha=0.8, + visible=False, + point_size=point_size, + ) + + pred_coords = [] + pred_normals = [] + pred_sem_color = [] + pred_inst_color = [] + + for did in range(len(sorted_masks)): + instances_colors = torch.from_numpy( + np.vstack( + get_evenly_distributed_colors( + max(1, sorted_masks[did].shape[1]) + ) + ) + ) + + for i in reversed(range(sorted_masks[did].shape[1])): + coords = full_res_coords[ + sorted_masks[did][:, i].astype(bool), : + ] + + mask_coords = full_res_coords[ + sorted_masks[did][:, i].astype(bool), : + ] + mask_normals = original_normals[ + sorted_masks[did][:, i].astype(bool), : + ] + + label = sort_classes[did][i] + + if len(mask_coords) 
def eval_step(self, batch, batch_idx):
    """Evaluate one batch and pass the predictions to ``eval_instance_step``.

    Args:
        batch: ``(data, target, file_names)`` triple; ``data`` additionally
            carries ``inverse_maps``, ``target_full``, ``original_colors``,
            ``original_normals``, ``original_coordinates`` and ``idx``.
        batch_idx: index of the batch (not used here).

    Returns:
        ``0.0`` when the batch has no coordinates, has zero raw-coordinate
        rows, or ``config.data.test_mode == "test"``; ``None`` when the
        forward pass hits the known single-point cross-attention error;
        otherwise a dict ``{"val_<loss name>": float}`` of weighted losses.

    Raises:
        RuntimeError: any other error raised by the forward pass.
        ValueError: re-raised from the criterion after printing debug info.
    """
    data, target, file_names = batch
    inverse_maps = data.inverse_maps
    target_full = data.target_full
    original_colors = data.original_colors
    data_idx = data.idx
    original_normals = data.original_normals
    original_coordinates = data.original_coordinates

    if len(data.coordinates) == 0:
        return 0.0

    raw_coordinates = None
    if self.config.data.add_raw_coordinates:
        raw_coordinates = data.features[:, -3:]
        data.features = data.features[:, :-3]

    # raw_coordinates stays None when add_raw_coordinates is disabled, so the
    # emptiness check must not dereference it unconditionally.
    if raw_coordinates is not None and raw_coordinates.shape[0] == 0:
        return 0.0

    data = ME.SparseTensor(
        coordinates=data.coordinates,
        features=data.features,
        device=self.device,
    )

    try:
        output = self.forward(
            data,
            point2segment=[
                target[i]["point2segment"] for i in range(len(target))
            ],
            raw_coordinates=raw_coordinates,
            is_eval=True,
        )
    except RuntimeError as run_err:
        print(run_err)
        if (
            "only a single point gives nans in cross-attention"
            == run_err.args[0]
        ):
            return None
        raise

    compute_losses = self.config.data.test_mode != "test"
    if compute_losses:
        # The loss is computed with deterministic algorithms switched off;
        # the finally clause restores the mode even if the criterion raises.
        deterministic = self.config.trainer.deterministic
        if deterministic:
            torch.use_deterministic_algorithms(False)
        try:
            try:
                losses = self.criterion(
                    output, target, mask_type=self.mask_type
                )
            except ValueError as val_err:
                print(f"ValueError: {val_err}")
                print(f"data shape: {data.shape}")
                print(f"data feat shape: {data.features.shape}")
                print(f"data feat nans: {data.features.isnan().sum()}")
                print(f"output: {output}")
                print(f"target: {target}")
                print(f"filenames: {file_names}")
                raise

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)
        finally:
            if deterministic:
                torch.use_deterministic_algorithms(True)

    rescaled_pca = None
    if self.config.general.save_visualizations:
        backbone_features = (
            output["backbone_features"].F.detach().cpu().numpy()
        )
        # imported lazily: only needed when visualizations are requested
        from sklearn import decomposition

        # project the backbone features to 3 components and rescale them
        # to 0..255 so they can be passed on as per-point colors
        pca = decomposition.PCA(n_components=3)
        pca.fit(backbone_features)
        pca_features = pca.transform(backbone_features)
        rescaled_pca = (
            255
            * (pca_features - pca_features.min())
            / (pca_features.max() - pca_features.min())
        )

    self.eval_instance_step(
        output,
        target,
        target_full,
        inverse_maps,
        file_names,
        original_coordinates,
        original_colors,
        original_normals,
        raw_coordinates,
        data_idx,
        backbone_features=rescaled_pca,
    )

    if compute_losses:
        return {
            f"val_{k}": v.detach().cpu().item() for k, v in losses.items()
        }
    return 0.0
def get_mask_and_scores(
    self, mask_cls, mask_pred, num_queries=100, num_classes=18, device=None
):
    """Select the top-scoring (query, class) pairs and score their masks.

    Args:
        mask_cls: class scores, flattened over its first two axes and
            indexed as ``query * num_classes + class``.
        mask_pred: mask logits whose second axis is the query axis.
        num_queries: number of queries in ``mask_cls``; also the number of
            pairs selected when ``config.general.topk_per_image`` is ``-1``.
        num_classes: number of classes per query.
        device: device for the label lookup table; defaults to
            ``self.device``.

    Returns:
        Tuple ``(score, result_pred_mask, classes, heatmap)``: the class score
        multiplied by the mean sigmoid value inside the binarised mask, the
        binarised masks (``mask_pred > 0`` as float), the class index of each
        selected pair, and the sigmoid of the selected mask logits.
    """
    if device is None:
        device = self.device
    labels = (
        torch.arange(num_classes, device=device)
        .unsqueeze(0)
        .repeat(num_queries, 1)
        .flatten(0, 1)
    )

    flat_scores = mask_cls.flatten(0, 1)
    requested_k = self.config.general.topk_per_image
    if requested_k == -1:
        requested_k = num_queries
    # topk raises when k exceeds the number of candidates, which happens when
    # only a few queries are passed in; clamp so those inputs still evaluate.
    k = min(requested_k, flat_scores.shape[0])
    scores_per_query, topk_indices = flat_scores.topk(k, sorted=True)

    labels_per_query = labels[topk_indices]
    # flat index -> index of the query the score belongs to
    topk_indices = topk_indices // num_classes
    mask_pred = mask_pred[:, topk_indices]

    result_pred_mask = (mask_pred > 0).float()
    heatmap = mask_pred.float().sigmoid()

    # mean heatmap value inside each binarised mask (epsilon guards empty masks)
    mask_scores_per_image = (heatmap * result_pred_mask).sum(0) / (
        result_pred_mask.sum(0) + 1e-6
    )
    score = scores_per_query * mask_scores_per_image
    classes = labels_per_query

    return score, result_pred_mask, classes, heatmap
range(len(prediction[self.decoder_id]["pred_masks"])): + if not first_full_res: + if self.model.train_on_segments: + masks = ( + prediction[self.decoder_id]["pred_masks"][bid] + .detach() + .cpu()[target_low_res[bid]["point2segment"].cpu()] + ) + else: + masks = ( + prediction[self.decoder_id]["pred_masks"][bid] + .detach() + .cpu() + ) + + if self.config.general.use_dbscan: + new_preds = { + "pred_masks": list(), + "pred_logits": list(), + } + + curr_coords_idx = masks.shape[0] + curr_coords = raw_coords[ + offset_coords_idx : curr_coords_idx + offset_coords_idx + ] + offset_coords_idx += curr_coords_idx + + for curr_query in range(masks.shape[1]): + curr_masks = masks[:, curr_query] > 0 + + if curr_coords[curr_masks].shape[0] > 0: + clusters = ( + DBSCAN( + eps=self.config.general.dbscan_eps, + min_samples=self.config.general.dbscan_min_points, + n_jobs=-1, + ) + .fit(curr_coords[curr_masks]) + .labels_ + ) + + new_mask = torch.zeros(curr_masks.shape, dtype=int) + new_mask[curr_masks] = ( + torch.from_numpy(clusters) + 1 + ) + + for cluster_id in np.unique(clusters): + original_pred_masks = masks[:, curr_query] + if cluster_id != -1: + new_preds["pred_masks"].append( + original_pred_masks + * (new_mask == cluster_id + 1) + ) + new_preds["pred_logits"].append( + prediction[self.decoder_id][ + "pred_logits" + ][bid, curr_query] + ) + + scores, masks, classes, heatmap = self.get_mask_and_scores( + torch.stack(new_preds["pred_logits"]).cpu(), + torch.stack(new_preds["pred_masks"]).T, + len(new_preds["pred_logits"]), + self.model.num_classes - 1, + ) + else: + scores, masks, classes, heatmap = self.get_mask_and_scores( + prediction[self.decoder_id]["pred_logits"][bid] + .detach() + .cpu(), + masks, + prediction[self.decoder_id]["pred_logits"][bid].shape[ + 0 + ], + self.model.num_classes - 1, + ) + + masks = self.get_full_res_mask( + masks, + inverse_maps[bid], + target_full_res[bid]["point2segment"], + ) + + heatmap = self.get_full_res_mask( + heatmap, + 
inverse_maps[bid], + target_full_res[bid]["point2segment"], + is_heatmap=True, + ) + + if backbone_features is not None: + backbone_features = self.get_full_res_mask( + torch.from_numpy(backbone_features), + inverse_maps[bid], + target_full_res[bid]["point2segment"], + is_heatmap=True, + ) + backbone_features = backbone_features.numpy() + else: + assert False, "not tested" + masks = self.get_full_res_mask( + prediction[self.decoder_id]["pred_masks"][bid].cpu(), + inverse_maps[bid], + target_full_res[bid]["point2segment"], + ) + + scores, masks, classes, heatmap = self.get_mask_and_scores( + prediction[self.decoder_id]["pred_logits"][bid].cpu(), + masks, + prediction[self.decoder_id]["pred_logits"][bid].shape[0], + self.model.num_classes - 1, + device="cpu", + ) + + masks = masks.numpy() + heatmap = heatmap.numpy() + + sort_scores = scores.sort(descending=True) + sort_scores_index = sort_scores.indices.cpu().numpy() + sort_scores_values = sort_scores.values.cpu().numpy() + sort_classes = classes[sort_scores_index] + + sorted_masks = masks[:, sort_scores_index] + sorted_heatmap = heatmap[:, sort_scores_index] + + if self.config.general.filter_out_instances: + keep_instances = set() + pairwise_overlap = sorted_masks.T @ sorted_masks + normalization = pairwise_overlap.max(axis=0) + norm_overlaps = pairwise_overlap / normalization + + for instance_id in range(norm_overlaps.shape[0]): + # filter out unlikely masks and nearly empty masks + # if not(sort_scores_values[instance_id] < 0.3 or sorted_masks[:, instance_id].sum() < 500): + if not ( + sort_scores_values[instance_id] + < self.config.general.scores_threshold + ): + # check if mask != empty + if not sorted_masks[:, instance_id].sum() == 0.0: + overlap_ids = set( + np.nonzero( + norm_overlaps[instance_id, :] + > self.config.general.iou_threshold + )[0] + ) + + if len(overlap_ids) == 0: + keep_instances.add(instance_id) + else: + if instance_id == min(overlap_ids): + keep_instances.add(instance_id) + + keep_instances 
= sorted(list(keep_instances)) + all_pred_classes.append(sort_classes[keep_instances]) + all_pred_masks.append(sorted_masks[:, keep_instances]) + all_pred_scores.append(sort_scores_values[keep_instances]) + all_heatmaps.append(sorted_heatmap[:, keep_instances]) + else: + all_pred_classes.append(sort_classes) + all_pred_masks.append(sorted_masks) + all_pred_scores.append(sort_scores_values) + all_heatmaps.append(sorted_heatmap) + + if self.validation_dataset.dataset_name == "scannet200": + all_pred_classes[bid][all_pred_classes[bid] == 0] = -1 + if self.config.data.test_mode != "test": + target_full_res[bid]["labels"][ + target_full_res[bid]["labels"] == 0 + ] = -1 + + for bid in range(len(prediction[self.decoder_id]["pred_masks"])): + all_pred_classes[ + bid + ] = self.validation_dataset._remap_model_output( + all_pred_classes[bid].cpu() + label_offset + ) + + if ( + self.config.data.test_mode != "test" + and len(target_full_res) != 0 + ): + target_full_res[bid][ + "labels" + ] = self.validation_dataset._remap_model_output( + target_full_res[bid]["labels"].cpu() + label_offset + ) + + # PREDICTION BOX + bbox_data = [] + for query_id in range( + all_pred_masks[bid].shape[1] + ): # self.model.num_queries + obj_coords = full_res_coords[bid][ + all_pred_masks[bid][:, query_id].astype(bool), : + ] + if obj_coords.shape[0] > 0: + obj_center = obj_coords.mean(axis=0) + obj_axis_length = obj_coords.max( + axis=0 + ) - obj_coords.min(axis=0) + + bbox = np.concatenate((obj_center, obj_axis_length)) + + bbox_data.append( + ( + all_pred_classes[bid][query_id].item(), + bbox, + all_pred_scores[bid][query_id], + ) + ) + self.bbox_preds[file_names[bid]] = bbox_data + + # GT BOX + bbox_data = [] + for obj_id in range(target_full_res[bid]["masks"].shape[0]): + if target_full_res[bid]["labels"][obj_id].item() == 255: + continue + + obj_coords = full_res_coords[bid][ + target_full_res[bid]["masks"][obj_id, :] + .cpu() + .detach() + .numpy() + .astype(bool), + :, + ] + if 
obj_coords.shape[0] > 0: + obj_center = obj_coords.mean(axis=0) + obj_axis_length = obj_coords.max( + axis=0 + ) - obj_coords.min(axis=0) + + bbox = np.concatenate((obj_center, obj_axis_length)) + bbox_data.append( + ( + target_full_res[bid]["labels"][obj_id].item(), + bbox, + ) + ) + + self.bbox_gt[file_names[bid]] = bbox_data + + if self.config.general.eval_inner_core == -1: + self.preds[file_names[bid]] = { + "pred_masks": all_pred_masks[bid], + "pred_scores": all_pred_scores[bid], + "pred_classes": all_pred_classes[bid], + } + else: + # prev val_dataset + self.preds[file_names[bid]] = { + "pred_masks": all_pred_masks[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + "pred_scores": all_pred_scores[bid], + "pred_classes": all_pred_classes[bid], + } + + if self.config.general.save_visualizations: + if "cond_inner" in self.test_dataset.data[idx[bid]]: + target_full_res[bid]["masks"] = target_full_res[bid][ + "masks" + ][:, self.test_dataset.data[idx[bid]]["cond_inner"]] + self.save_visualizations( + target_full_res[bid], + full_res_coords[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + [self.preds[file_names[bid]]["pred_masks"]], + [self.preds[file_names[bid]]["pred_classes"]], + file_names[bid], + original_colors[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + original_normals[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + [self.preds[file_names[bid]]["pred_scores"]], + sorted_heatmaps=[ + all_heatmaps[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ] + ], + query_pos=all_query_pos[bid][ + self.test_dataset.data[idx[bid]]["cond_inner"] + ] + if len(all_query_pos) > 0 + else None, + backbone_features=backbone_features[ + self.test_dataset.data[idx[bid]]["cond_inner"] + ], + point_size=self.config.general.visualization_point_size, + ) + else: + self.save_visualizations( + target_full_res[bid], + full_res_coords[bid], + [self.preds[file_names[bid]]["pred_masks"]], + 
def eval_instance_epoch_end(self):
    """Compute box-AP and mask-AP metrics for the accumulated predictions and log them.

    Box AP at IoU 0.25 / 0.5 comes from ``eval_det`` on ``self.bbox_preds``
    and ``self.bbox_gt``. Mask AP is produced by ``evaluate``, which writes
    a CSV to ``<base_path>/tmp_output.txt``; that file is parsed here
    (header skipped; columns read as class name, an ignored field, AP,
    AP50, AP25). For ``scannet200`` the per-class values are additionally
    averaged over the head / common / tail category groups.

    All results go through ``self.log_dict``. Afterwards the output
    directory is removed unless ``config.general.export`` is set, and
    ``self.preds``, ``self.bbox_preds`` and ``self.bbox_gt`` are reset to
    empty dicts.

    Raises:
        AssertionError: a validated scannet200 class belongs to none of
            the head / common / tail groups.
    """
    log_prefix = "val"
    metric_suffixes = ("ap", "ap_50", "ap_25")  # column order in the CSV
    ap_results = {}

    head_results, tail_results, common_results = [], [], []

    box_ap_50 = eval_det(
        self.bbox_preds, self.bbox_gt, ovthresh=0.5, use_07_metric=False
    )
    box_ap_25 = eval_det(
        self.bbox_preds, self.bbox_gt, ovthresh=0.25, use_07_metric=False
    )
    # The last element of each eval_det result is used as {class_id: AP}.
    ap_results[f"{log_prefix}_mean_box_ap_25"] = sum(
        box_ap_25[-1].values()
    ) / len(box_ap_25[-1])
    ap_results[f"{log_prefix}_mean_box_ap_50"] = sum(
        box_ap_50[-1].values()
    ) / len(box_ap_50[-1])

    for class_id, class_ap in box_ap_50[-1].items():
        class_name = self.train_dataset.label_info[class_id]["name"]
        ap_results[f"{log_prefix}_{class_name}_val_box_ap_50"] = class_ap

    for class_id, class_ap in box_ap_25[-1].items():
        class_name = self.train_dataset.label_info[class_id]["name"]
        ap_results[f"{log_prefix}_{class_name}_val_box_ap_25"] = class_ap

    root_path = "eval_output"
    base_path = f"{root_path}/instance_evaluation_{self.config.general.experiment_name}_{self.current_epoch}"

    dataset_name = self.validation_dataset.dataset_name
    is_scannet200 = dataset_name == "scannet200"

    if dataset_name in ["scannet", "stpls3d", "scannet200"]:
        gt_data_path = f"{self.validation_dataset.data_dir[0]}/instance_gt/{self.validation_dataset.mode}"
    else:
        gt_data_path = f"{self.validation_dataset.data_dir[0]}/instance_gt/Area_{self.config.general.area}"

    pred_path = f"{base_path}/tmp_output.txt"

    os.makedirs(base_path, exist_ok=True)

    try:
        if dataset_name == "s3dis":
            area_prefix = f"Area_{self.config.general.area}_"
            new_preds = {
                key.replace(area_prefix, ""): {
                    # s3dis predictions are shifted by one before evaluation
                    "pred_classes": pred["pred_classes"] + 1,
                    "pred_masks": pred["pred_masks"],
                    "pred_scores": pred["pred_scores"],
                }
                for key, pred in self.preds.items()
            }
            mprec, mrec = evaluate(
                new_preds, gt_data_path, pred_path, dataset="s3dis"
            )
            ap_results[f"{log_prefix}_mean_precision"] = mprec
            ap_results[f"{log_prefix}_mean_recall"] = mrec
        elif dataset_name == "stpls3d":
            new_preds = {
                key.replace(".txt", ""): {
                    "pred_classes": pred["pred_classes"],
                    "pred_masks": pred["pred_masks"],
                    "pred_scores": pred["pred_scores"],
                }
                for key, pred in self.preds.items()
            }
            evaluate(new_preds, gt_data_path, pred_path, dataset="stpls3d")
        else:
            evaluate(
                self.preds,
                gt_data_path,
                pred_path,
                dataset=dataset_name,
            )

        with open(pred_path, "r") as fin:
            for line_id, line in enumerate(fin):
                if line_id == 0:
                    # ignore header
                    continue
                class_name, _, ap, ap_50, ap_25 = line.strip().split(",")

                # scannet200 only reports the classes in the validation list
                if (
                    is_scannet200
                    and class_name not in VALID_CLASS_IDS_200_VALIDATION
                ):
                    continue

                scores = (float(ap), float(ap_50), float(ap_25))
                for suffix, value in zip(metric_suffixes, scores):
                    ap_results[
                        f"{log_prefix}_{class_name}_val_{suffix}"
                    ] = value

                if not is_scannet200:
                    continue

                if class_name in HEAD_CATS_SCANNET_200:
                    head_results.append(np.array(scores))
                elif class_name in COMMON_CATS_SCANNET_200:
                    common_results.append(np.array(scores))
                elif class_name in TAIL_CATS_SCANNET_200:
                    tail_results.append(np.array(scores))
                else:
                    # The original `assert (False, "...")` tested a
                    # non-empty tuple and therefore could never fire.
                    raise AssertionError("class not known!")

        if is_scannet200:
            head_results = np.stack(head_results)
            common_results = np.stack(common_results)
            tail_results = np.stack(tail_results)

            mean_tail_results = np.nanmean(tail_results, axis=0)
            mean_common_results = np.nanmean(common_results, axis=0)
            mean_head_results = np.nanmean(head_results, axis=0)

            # Column 0 is plain AP; it used to be written under the
            # "*_ap_25" keys and then overwritten by column 2, so the
            # plain-AP group means were never logged.
            for column, suffix in enumerate(metric_suffixes):
                ap_results[
                    f"{log_prefix}_mean_tail_{suffix}"
                ] = mean_tail_results[column]
                ap_results[
                    f"{log_prefix}_mean_common_{suffix}"
                ] = mean_common_results[column]
                ap_results[
                    f"{log_prefix}_mean_head_{suffix}"
                ] = mean_head_results[column]

            overall_ap_results = np.nanmean(
                np.vstack((head_results, common_results, tail_results)),
                axis=0,
            )

            ap_results[f"{log_prefix}_mean_ap"] = overall_ap_results[0]
            ap_results[f"{log_prefix}_mean_ap_50"] = overall_ap_results[1]
            ap_results[f"{log_prefix}_mean_ap_25"] = overall_ap_results[2]
        else:
            # Snapshot the per-class values before adding the means so the
            # "val_ap*" suffix filters only ever see per-class entries.
            per_class = list(ap_results.items())
            for suffix in metric_suffixes:
                ap_results[f"{log_prefix}_mean_{suffix}"] = statistics.mean(
                    [
                        item
                        for key, item in per_class
                        if key.endswith(f"val_{suffix}")
                    ]
                )

        # NaN metrics are logged as 0.0
        ap_results = {
            key: 0.0 if math.isnan(score) else score
            for key, score in ap_results.items()
        }
    except (IndexError, OSError):
        print("NO SCORES!!!")
        ap_results[f"{log_prefix}_mean_ap"] = 0.0
        ap_results[f"{log_prefix}_mean_ap_50"] = 0.0
        ap_results[f"{log_prefix}_mean_ap_25"] = 0.0

    self.log_dict(ap_results)

    if not self.config.general.export:
        shutil.rmtree(base_path)

    # Drop the accumulated per-scene state before the next epoch.
    del self.preds
    del self.bbox_preds
    del self.bbox_gt

    gc.collect()

    self.preds = dict()
    self.bbox_preds = dict()
    self.bbox_gt = dict()
def configure_optimizers(self):
    """Instantiate the optimizer and LR scheduler described in the config.

    Returns:
        ``([optimizer], [scheduler_config])`` where ``scheduler_config``
        holds the scheduler under ``"scheduler"`` plus every entry of
        ``config.scheduler.pytorch_lightning_params``.
    """
    optimizer = hydra.utils.instantiate(
        self.config.optimizer, params=self.parameters()
    )

    needs_steps = "steps_per_epoch" in self.config.scheduler.scheduler.keys()
    if needs_steps:
        # Filled in from the training dataloader length whenever the
        # scheduler config declares the key.
        batches_per_epoch = len(self.train_dataloader())
        self.config.scheduler.scheduler.steps_per_epoch = batches_per_epoch

    scheduler_config = {
        "scheduler": hydra.utils.instantiate(
            self.config.scheduler.scheduler, optimizer=optimizer
        )
    }
    scheduler_config.update(self.config.scheduler.pytorch_lightning_params)
    return [optimizer], [scheduler_config]
def plot_grad_flow(named_parameters):
    """Plot the mean absolute gradient of each trainable, non-bias parameter.

    Args:
        named_parameters: iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()``, read after a backward pass.

    Parameters whose ``grad`` is ``None`` are reported on stdout and left
    out of the plot. The figure is drawn on the current pyplot axes;
    nothing is returned.
    """
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if not p.requires_grad or "bias" in n:
            continue
        # Compare against None: truth-testing a multi-element gradient
        # tensor raises instead of answering "is there a gradient?".
        if p.grad is None:
            print(f"{n} - doesn't have gradient computed")
            continue
        layers.append(n)
        # .item() gives a plain float so plotting works on any device
        ave_grads.append(p.grad.abs().mean().item())

    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    # left/right, matching plot_grad_flow_v2 (xmin/xmax are legacy names)
    plt.xlim(left=0, right=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
class StratifiedGroupKFold:
    """
    Stratified Group K-fold with sklearn.model_selection.KFold compatibility.

    Split dataset into k folds with balanced label distribution (stratified)
    and non-overlapping groups.

    Args:
        n_splits (int): number of folds
        shuffle (bool): perturb the group ordering with seeded Gaussian
            noise before the greedy fold assignment
        random_state (int): seed for the random number generator used
            when ``shuffle`` is True
    """

    def __init__(self, n_splits, shuffle=True, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.seed = random_state

    def split(self, X, labels, groups):
        """Yield ``(train_indices, test_indices)`` for each of the folds.

        Args:
            X: samples; only its length is used.
            labels: one hashable label per sample.
            groups: one hashable group key per sample; a group never
                appears in both the train and the test side of a fold.

        Yields:
            Two lists of sample positions per fold.

        Raises:
            AssertionError: the inputs differ in length, or there are
                fewer distinct groups than ``n_splits``.
        """
        assert len(X) == len(labels) == len(groups), "Invalid input length"
        assert (
            len(set(groups)) >= self.n_splits
        ), "The number of groups needs to be at least n_splits"

        def encode(v):
            # dict.fromkeys keeps first-appearance order, so the integer
            # codes (and therefore the folds) do not depend on set
            # iteration order, which varies with the hash seed for strings.
            d = {l: i for i, l in enumerate(dict.fromkeys(v))}
            return [d[e] for e in v]

        labels, groups = encode(labels), encode(groups)
        num_labels, num_groups = max(labels) + 1, max(groups) + 1
        label_counts_per_group = np.zeros((num_groups, num_labels), dtype=int)
        global_label_dist = np.bincount(labels)
        for label, g in zip(labels, groups):
            label_counts_per_group[g][label] += 1

        label_counts_per_fold = np.zeros(
            (self.n_splits, num_labels), dtype=int
        )
        groups_per_fold = [set() for _ in range(self.n_splits)]

        def eval_label_counts_per_fold(y_counts, fold):
            # `fold` is a row view into label_counts_per_fold: add the
            # candidate group, measure the imbalance, then undo the change.
            fold += y_counts
            std_per_label = (
                np.std(label_counts_per_fold, axis=0) / global_label_dist
            )
            fold -= y_counts
            return np.mean(std_per_label)

        groups_and_label_counts = list(enumerate(label_counts_per_group))
        if self.shuffle:
            rng = random.Random(self.seed)
            mean_std = np.mean(np.std(label_counts_per_group, axis=1))
            groups_and_label_counts.sort(
                key=lambda g_counts: -np.std(g_counts[1])
                + rng.gauss(0, mean_std)
            )  # add rng.gauss to increase the randomness
        else:
            groups_and_label_counts.sort(
                key=lambda g_counts: -np.std(g_counts[1])
            )

        # Greedy assignment: each group goes to the fold where it leaves
        # the label distribution most balanced.
        for g, label_counts in groups_and_label_counts:
            evals = [
                eval_label_counts_per_fold(
                    label_counts, label_counts_per_fold[i]
                )
                for i in range(self.n_splits)
            ]
            best_fold = np.argmin(evals)
            label_counts_per_fold[best_fold] += label_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for test_groups in groups_per_fold:
            train_groups = all_groups - test_groups

            train_indices = [
                i for i, g in enumerate(groups) if g in train_groups
            ]
            test_indices = [
                i for i, g in enumerate(groups) if g in test_groups
            ]

            yield train_indices, test_indices
def _remap_model_output(output, labels):
    """Translate positional class indices into the keys of ``labels``.

    Every entry of ``output`` equal to ``i`` is replaced by the i-th key
    of ``labels`` (in the mapping's iteration order). Values matching no
    position are left untouched. A new array is returned; the input is
    not modified.
    """
    source = np.array(output)
    remapped = source.copy()
    for position, label_id in enumerate(labels.keys()):
        # Match against the untouched source so earlier replacements can
        # never be remapped a second time.
        hits = source == position
        remapped[hits] = label_id
    return remapped
def draw_confsion_matrix(confusion_matrix, label_db):
    """Render a confusion matrix as a heatmap and return it as an image.

    Args:
        confusion_matrix: square array of counts; row/column ``i`` is
            the i-th key of ``label_db``.
        label_db: mapping from label id to a dict with a ``"name"`` entry.

    Returns:
        The JPEG-encoded figure read back through ``imread``.
    """
    index = list(range(confusion_matrix.shape[0]))
    index = _remap_model_output(index, label_db)
    # dtype=object: a plain np.full(n, "empty") is a fixed-width <U5
    # array that silently truncates every class name to 5 characters.
    column_names = np.full((len(index)), "empty", dtype=object)
    for k, v in label_db.items():
        column_names[index == k] = v["name"]
    df_cm = DataFrame(
        confusion_matrix, index=column_names, columns=column_names
    )
    sns.heatmap(
        df_cm,
        annot=True,
        fmt="d",
        linewidths=0.25,
        annot_kws={"size": 5},
        vmax=10000,
    )
    # Round-trip through an in-memory JPEG to obtain the pixel array.
    buf = BytesIO()
    plt.savefig(buf, format="jpg")
    plt.close()
    buf.seek(0)
    image = imread(buf, format="jpg")
    buf.close()
    return image
def load_ply(filepath):
    """Read coordinates, optional colours and optional labels from a PLY file.

    Only the first element of the file is used.

    Returns:
        ``(coords, feats, labels)``: ``coords`` is a float32 array built
        from the ``x``/``y``/``z`` fields; ``feats`` is a uint8 array of
        ``red``/``green``/``blue`` or None when any of the three is
        missing; ``labels`` is a uint32 array of ``label`` or None when
        that field is absent.
    """
    with open(filepath, "rb") as ply_file:
        parsed = PlyData.read(ply_file)
    vertex_data = parsed.elements[0].data
    field_names = vertex_data.dtype.names

    coords = np.array(
        [vertex_data[axis] for axis in ("x", "y", "z")], dtype=np.float32
    ).T

    # Colours are only returned when all three channels are present.
    if {"red", "green", "blue"}.issubset(field_names):
        feats = np.array(
            [vertex_data[channel] for channel in ("red", "green", "blue")],
            dtype=np.uint8,
        ).T
    else:
        feats = None

    if "label" in field_names:
        labels = np.array(vertex_data["label"], dtype=np.uint32)
    else:
        labels = None

    return coords, feats, labels
aggregation_forward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, output); +} + +void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor) +{ + const float *input = input_tensor.data_ptr(); + const float *position = position_tensor.data_ptr(); + const float *weight = weight_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + const float *grad_output = grad_output_tensor.data_ptr(); + float *grad_input = grad_input_tensor.data_ptr(); + float *grad_position = grad_position_tensor.data_ptr(); + float *grad_weight = grad_weight_tensor.data_ptr(); + aggregation_backward_cuda_launcher(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8339bb7e2088abffefba02c26b248edafed6cf47 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.cu @@ -0,0 +1,53 @@ +#include "../cuda_utils.h" +#include "aggregation_cuda_kernel.h" + + +__global__ void aggregation_forward_cuda_kernel(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) { + // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * c) return; + const int c_idx = index % c; + const int n_idx = index / c; + const int w_c_idx = c_idx % w_c; + for (int nsample_idx = 0; nsample_idx < nsample; nsample_idx++) + { + int idx_idx = n_idx 
// Launches aggregation_forward_cuda_kernel with one thread per output
// element (n * c in total), rounded up to whole blocks.
void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output) {
    // input: input: (n, c), position: (n, nsample, c), weight: (n, nsample, w_c), idx: (n, nsample), output: (n, c)
    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);
    // The execution configuration must name the grid and block sizes
    // computed above; an empty "<<>>" is not a valid kernel launch.
    aggregation_forward_cuda_kernel<<<blocks, threads, 0>>>(n, nsample, c, w_c, input, position, weight, idx, output);
}
const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight) { + // input: grad_output: (n, c), output: grad_input: (n, c), grad_position: (n, nsample, c), grad_weight: (n, nsample, w_c) + dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + aggregation_backward_cuda_kernel<<>>(n, nsample, c, w_c, input, position, weight, idx, grad_output, grad_input, grad_position, grad_weight); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5211a96aa2acbe0d9baf32bddc9ab4be87703072 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/aggregation/aggregation_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _AGGREGATION_CUDA_KERNEL +#define _AGGREGATION_CUDA_KERNEL +#include +#include +#include + +void aggregation_forward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void aggregation_backward_cuda(int n, int nsample, int c, int w_c, at::Tensor input_tensor, at::Tensor position_tensor, at::Tensor weight_tensor, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input_tensor, at::Tensor grad_position_tensor, at::Tensor grad_weight_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void aggregation_forward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, float *output); +void aggregation_backward_cuda_launcher(int n, int nsample, int c, int w_c, const float *input, const float *position, const float *weight, const int *idx, const float *grad_output, float *grad_input, float *grad_position, float *grad_weight); + +#ifdef __cplusplus +} +#endif +#endif 
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8d2c725ae0ed70c884a8643aa74ba0c0f6660d30 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda.cpp @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include "attention_cuda_kernel.h" + +void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor) +{ + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *attn = attn_tensor.data_ptr(); + attention_step1_forward_cuda_launcher(N, M, h, C, q, k, index0, index1, attn); +} + +void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + attention_step1_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); +} + +void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *output = 
output_tensor.data_ptr(); + attention_step2_forward_cuda_launcher(N, M, h, C, attn, v, index0, index1, output); +} + + +void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + attention_step2_backward_cuda_launcher(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f71ad62987233229fcb547b30cfb7b9191683050 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.cu @@ -0,0 +1,103 @@ +#include "../cuda_utils.h" +#include "attention_cuda_kernel.h" + + +__global__ void attention_step1_forward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *q, const float *k, + const int *index0, const int *index1, float *attn) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; + atomicAdd(attn+m_idx*h+h_idx, val); +} + +__global__ void attention_step1_backward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, + float *grad_q, 
float *grad_k) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = m_idx*h+h_idx; + int q_idx = idx0*C+h_idx*C/h+c_idx; + int k_idx = idx1*C+h_idx*C/h+c_idx; + atomicAdd(grad_q+q_idx, grad_out[grad_out_idx] * k[k_idx]); + atomicAdd(grad_k+k_idx, grad_out[grad_out_idx] * q[q_idx]); +} + +void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, + const int *index0, const int *index1, float *attn) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step1_forward_cuda_kernel<<>>(N, M, h, C, q, k, index0, index1, attn); +} + +void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float *q, const float *k, float *grad_q, float *grad_k) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step1_backward_cuda_kernel<<>>(N, M, h, C, grad_out, index0, index1, q, k, grad_q, grad_k); +} + +__global__ void attention_step2_forward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx1 = index1[m_idx]; + float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; + int idx0 = index0[m_idx]; + atomicAdd(output+idx0*C+h_idx*C/h+c_idx, 
val); +} + +__global__ void attention_step2_backward_cuda_kernel( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, + float *grad_attn, float *grad_v) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = idx0*C+h_idx*C/h+c_idx; + atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); + atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); +} + +void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_forward_cuda_kernel<<>>(N, M, h, C, attn, v, index0, index1, output); +} + +void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float *attn, const float *v, float *grad_attn, float *grad_v) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_backward_cuda_kernel<<>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..cbd99b9b6a9c65af76aa95d00fff6306446114cd --- 
/dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention/attention_cuda_kernel.h @@ -0,0 +1,26 @@ +#ifndef _ATTENTION_CUDA_KERNEL +#define _ATTENTION_CUDA_KERNEL +#include +#include +#include + +void attention_step1_forward_cuda(int N, int M, int h, int C, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor); +void attention_step1_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor); + +void attention_step2_forward_cuda(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor); +void attention_step2_backward_cuda(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void attention_step1_forward_cuda_launcher(int N, int M, int h, int C, const float *q, const float *k, const int *index0, const int *index1, float *attn); +void attention_step1_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k); + +void attention_step2_forward_cuda_launcher(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output); +void attention_step2_backward_cuda_launcher(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu 
b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e5343f5a3a0ad52aae7d06d22989f04390b68f6 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.cu @@ -0,0 +1,193 @@ +#include "../cuda_utils.h" +#include "attention_cuda_kernel_v2.h" + + +template +__global__ void attention_step1_forward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, const float *q, const float *k, + const int *index0_offsets, const int *index1, float *attn) { + + int h_idx = blockIdx.y; + int q_idx = blockIdx.x; + int n_idx = threadIdx.x; + int C = h * d; + // if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + __shared__ float query_vec[d]; + __shared__ int start, end; + + // if(n_idx == 0){ + // printf("blockDim.x: %d\n", blockDim.x); + // } + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + // printf("start: %d, end: %d, blockDim.x: %d\n", start, end, blockDim.x); + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __syncthreads(); + + int m_idx = start + n_idx; + if(m_idx >= end) + return; + + float sum = 0; + for(int i = 0; i < d; i++){ + int k_idx = index1[m_idx]; + float key = k[k_idx * C + h_idx * d + i]; + sum += query_vec[i] * key; + } + attn[m_idx*h + h_idx] = sum; + // int idx0 = index0[m_idx]; + // int idx1 = index1[m_idx]; + // float val = q[idx0*C+h_idx*C/h+c_idx] * k[idx1*C+h_idx*C/h+c_idx]; + // atomicAdd(attn+m_idx*h+h_idx, val); +} + +template +__global__ void attention_step1_backward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, + float *grad_q, float *grad_k) { + + int h_idx = blockIdx.y; + int q_idx = blockIdx.x; + int n_idx = threadIdx.x; + int C = d * h; + + __shared__ float query_vec[d]; + __shared__ int start, end; + + if 
(n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __shared__ float gradient_new[d]; + for(int i = n_idx; i < d; i += blockDim.x) + gradient_new[i] = 0; + + __syncthreads(); + + int m_idx = start + n_idx; + if(m_idx < end){ + float gradient = grad_out[m_idx*h + h_idx]; + for(int i = 0; i < d; i++){ + int k_idx = index1[m_idx]; + atomicAdd(&gradient_new[i], gradient * k[k_idx*C + h_idx*d + i]); + atomicAdd(grad_k + k_idx*C + h_idx*d + i, gradient * query_vec[i]); + } + } + __syncthreads(); + + for(int i = n_idx; i < d; i += blockDim.x) + grad_q[q_idx*C + h_idx*d + i] = gradient_new[i]; +} + +void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, + const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + // n_threads = n_threads > 1024 ? 
512 : n_threads; + + // printf("n_max: %d, n_threads: %d\n", n_max, n_threads); + + // dim3 threads(THREADS_PER_BLOCK); + // attention_step1_forward_cuda_kernel_v2<<>>(N, M, h, C, q, k, index0, index1, attn); + + switch (C / h) { + case 16: + attention_step1_forward_cuda_kernel_v2<16><<>>(N, M, h, q, k, index0_offsets, index1, attn); + break; + case 32: + attention_step1_forward_cuda_kernel_v2<32><<>>(N, M, h, q, k, index0_offsets, index1, attn); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, + const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + // dim3 threads(THREADS_PER_BLOCK); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + // attention_step1_backward_cuda_kernel_v2<<>>(N, M, h, C/h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + // n_threads = n_threads > 1024 ? 
512 : n_threads; + + // printf("n_max: %d, n_threads: %d\n", n_max, n_threads); + + switch (C / h) { + case 16: + attention_step1_backward_cuda_kernel_v2<16><<>>(N, M, h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + break; + case 32: + attention_step1_backward_cuda_kernel_v2<32><<>>(N, M, h, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); + break; + default: + throw "d != 16 and d != 32"; + } + +} + +__global__ void attention_step2_forward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx1 = index1[m_idx]; + float val = attn[m_idx*h+h_idx] * v[idx1*C+h_idx*C/h+c_idx]; + int idx0 = index0[m_idx]; + atomicAdd(output+idx0*C+h_idx*C/h+c_idx, val); +} + +__global__ void attention_step2_backward_cuda_kernel_v2( // M, h, C//h + int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, + float *grad_attn, float *grad_v) { + + int c_idx = blockIdx.z; + int h_idx = blockIdx.y; + int m_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (m_idx >= M || h_idx >= h || c_idx >= C / h) return; + + int idx0 = index0[m_idx]; + int idx1 = index1[m_idx]; + int grad_out_idx = idx0*C+h_idx*C/h+c_idx; + atomicAdd(grad_attn+m_idx*h+h_idx, grad_out[grad_out_idx] * v[idx1*C+h_idx*C/h+c_idx]); + atomicAdd(grad_v+idx1*C+h_idx*C/h+c_idx, grad_out[grad_out_idx] * attn[m_idx*h+h_idx]); +} + +void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, + const int *index0, const int *index1, float *output) { + // input: attn: (M, h), v: (N, h, C/h), index0: (M, ), index1: (M, ) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 
threads(THREADS_PER_BLOCK); + attention_step2_forward_cuda_kernel_v2<<>>(N, M, h, C, attn, v, index0, index1, output); +} + +void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, + const float *attn, const float *v, float *grad_attn, float *grad_v) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(C/h, THREADS_PER_BLOCK), h, M); + dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), h, C/h); + dim3 threads(THREADS_PER_BLOCK); + attention_step2_backward_cuda_kernel_v2<<>>(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, grad_v); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..d7e7f047bc318928ddb9402acbcdf20204596450 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_kernel_v2.h @@ -0,0 +1,26 @@ +#ifndef _ATTENTION_V2_CUDA_KERNEL +#define _ATTENTION_V2_CUDA_KERNEL +#include +#include +#include + +void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor); +void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor); + +void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor); +void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, at::Tensor index0_tensor, 
at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void attention_step1_forward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *q, const float *k, const int *index0_offsets, const int *index1, float *attn); +void attention_step1_backward_cuda_launcher_v2(int N, int M, int h, int C, const unsigned int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *q, const float *k, float *grad_q, float *grad_k); + +void attention_step2_forward_cuda_launcher_v2(int N, int M, int h, int C, const float *attn, const float *v, const int *index0, const int *index1, float *output); +void attention_step2_backward_cuda_launcher_v2(int N, int M, int h, int C, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, float *grad_attn, float *grad_v); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..311adaf223928f83f3f238268fe0f189b5479657 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/attention_v2/attention_cuda_v2.cpp @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include "attention_cuda_kernel_v2.h" + +void attention_step1_forward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor attn_tensor) +{ + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index0_offsets = index0_tensor_offsets.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *attn = attn_tensor.data_ptr(); + attention_step1_forward_cuda_launcher_v2(N, M, h, C, 
n_max, q, k, index0_offsets, index1, attn); +} + +void attention_step1_backward_cuda_v2(int N, int M, int h, int C, const unsigned int n_max, at::Tensor grad_out_tensor, + at::Tensor index0_tensor_offsets, at::Tensor index1_tensor, at::Tensor q_tensor, at::Tensor k_tensor, + at::Tensor grad_q_tensor, at::Tensor grad_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0_offsets = index0_tensor_offsets.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + attention_step1_backward_cuda_launcher_v2(N, M, h, C, n_max, grad_out, index0_offsets, index1, q, k, grad_q, grad_k); +} + +void attention_step2_forward_cuda_v2(int N, int M, int h, int C, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + attention_step2_forward_cuda_launcher_v2(N, M, h, C, attn, v, index0, index1, output); +} + + +void attention_step2_backward_cuda_v2(int N, int M, int h, int C, at::Tensor grad_out_tensor, + at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0 = index0_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + attention_step2_backward_cuda_launcher_v2(N, M, h, C, grad_out, index0, index1, attn, v, grad_attn, 
grad_v); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/cuda_utils.h b/models/Mask3D/mask3d/utils/pointops2/src/cuda_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..e67749c4f5f8964ffb5916c13f5260cf8df45f52 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/cuda_utils.h @@ -0,0 +1,23 @@ +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include +#include + +#define TOTAL_THREADS 1024 +#define THREADS_PER_BLOCK 256 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return std::max(std::min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = std::max(std::min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + return block_config; +} + +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a00d3139db5a3b58261c825c4a9e46e168fea8ce --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include +#include "grouping_cuda_kernel.h" + + +void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor) +{ + const float *input = input_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + grouping_forward_cuda_launcher(m, nsample, c, input, idx, output); +} + +void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor) +{ + const float *grad_output = grad_output_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float 
*grad_input = grad_input_tensor.data_ptr(); + grouping_backward_cuda_launcher(m, nsample, c, grad_output, idx, grad_input); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..58ec0a21a2949f9f82504ccd24597c544c50af40 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.cu @@ -0,0 +1,40 @@ +#include "../cuda_utils.h" +#include "grouping_cuda_kernel.h" + + +__global__ void grouping_forward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ input, const int *__restrict__ idx, float *__restrict__ output) { + // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= m * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int m_idx = index / nsample / c; + const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; + output[index] = input[input_idx]; +} + +__global__ void grouping_backward_cuda_kernel(int m, int nsample, int c, const float *__restrict__ grad_output, const int *__restrict__ idx, float *__restrict__ grad_input) { + // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= m * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int m_idx = index / nsample / c; + const int input_idx = idx[m_idx * nsample + nsample_idx] * c + c_idx; + atomicAdd(grad_input + input_idx, grad_output[index]); +} + +void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output) { + // input: input: (n, c), idx: (m, nsample), output: (m, nsample, c) + dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); + 
dim3 threads(THREADS_PER_BLOCK); + grouping_forward_cuda_kernel<<>>(m, nsample, c, input, idx, output); +} + +void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input) +{ + // input: grad_output: (m, nsample, c), idx: (m, nsample), output: grad_input: (n, c) + dim3 blocks(DIVUP(m * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + grouping_backward_cuda_kernel<<>>(m, nsample, c, grad_output, idx, grad_input); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3db4aaa9fad5811d559d47c500e4b00f0165d9b4 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/grouping/grouping_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _GROUPING_CUDA_KERNEL +#define _GROUPING_CUDA_KERNEL +#include +#include +#include + +void grouping_forward_cuda(int m, int nsample, int c, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void grouping_backward_cuda(int m, int nsample, int c, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor grad_input_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void grouping_forward_cuda_launcher(int m, int nsample, int c, const float *input, const int *idx, float *output); +void grouping_backward_cuda_launcher(int m, int nsample, int c, const float *grad_output, const int *idx, float *grad_input); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a73c02b1193330af8e0bc66093749126561700b3 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda.cpp @@ -0,0 +1,24 @@ +#include +#include 
+#include
+#include
+#include "interpolation_cuda_kernel.h"
+
+
+// Forward entry point exposed to Python: extracts the raw data pointers from the
+// tensors and hands them to the launcher. Element types follow the pointer each
+// result is assigned to (float data, int indices).
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor)
+{
+    const float *input = input_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *output = output_tensor.data_ptr<float>();
+    interpolation_forward_cuda_launcher(n, c, k, input, idx, weight, output);
+}
+
+// Backward entry point: same unpacking as the forward wrapper, for the gradient tensors.
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor)
+{
+    const float *grad_output = grad_output_tensor.data_ptr<float>();
+    const int *idx = idx_tensor.data_ptr<int>();
+    const float *weight = weight_tensor.data_ptr<float>();
+    float *grad_input = grad_input_tensor.data_ptr<float>();
+    interpolation_backward_cuda_launcher(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f560d8c92c6eac865b8c1e1dc27140fe3fcc2250
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.cu
@@ -0,0 +1,47 @@
+#include "../cuda_utils.h"
+#include "interpolation_cuda_kernel.h"
+
+
+// One thread per output element (n_idx, c_idx): adds the k weighted input rows
+// selected by idx into output. The result is accumulated with +=, so output keeps
+// whatever it held on entry (presumably zero-filled by the caller - confirm).
+__global__ void interpolation_forward_cuda_kernel(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output)
+{
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        output[index] += input[input_idx] * weight[idx_idx];
+    }
+}
+
+// One thread per grad_output element: scatters its weighted value back to the k
+// input rows it was interpolated from. atomicAdd is used because the target row
+// comes from idx, so different threads can hit the same grad_input element.
+__global__ void interpolation_backward_cuda_kernel(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input)
+{
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n * c) return;
+    int c_idx = index % c;
+    int n_idx = index / c;
+    for (int i = 0; i < k; i++)
+    {
+        int idx_idx = n_idx * k + i;
+        int input_idx = idx[idx_idx] * c + c_idx;
+        atomicAdd(grad_input + input_idx, grad_output[index] * weight[idx_idx]);
+    }
+}
+
+// Launches the forward kernel over n * c threads in blocks of THREADS_PER_BLOCK.
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output) {
+    // input: input: (m, c), idx: (n, k), weight: (n, k), output: output (n, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_forward_cuda_kernel<<<blocks, threads>>>(n, c, k, input, idx, weight, output);
+}
+
+// Launches the backward kernel over n * c threads in blocks of THREADS_PER_BLOCK.
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input) {
+    // input: grad_output: (n, c), idx: (n, k), weight: (n, k), output: grad_input (m, c)
+    dim3 blocks(DIVUP(n * c, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    interpolation_backward_cuda_kernel<<<blocks, threads>>>(n, c, k, grad_output, idx, weight, grad_input);
+}
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..309e5dd0a34ccb58807bbf32389ba65e7ee6961b
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/interpolation/interpolation_cuda_kernel.h
@@ -0,0 +1,20 @@
+#ifndef _INTERPOLATION_CUDA_KERNEL
+#define _INTERPOLATION_CUDA_KERNEL
+#include
+#include
+#include
+
+void interpolation_forward_cuda(int n, int c, int k, at::Tensor input_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor output_tensor);
+void interpolation_backward_cuda(int n, int c, int k, at::Tensor grad_output_tensor, at::Tensor idx_tensor, at::Tensor weight_tensor, at::Tensor grad_input_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void interpolation_forward_cuda_launcher(int n, int c, int k, const float *input, const int *idx, const float *weight, float *output);
+void interpolation_backward_cuda_launcher(int n, int c, int k, const float *grad_output, const int *idx, const float *weight, float *grad_input);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..568f1366f65dda9f57f037212a46d2552806e79f
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda.cpp
@@ -0,0 +1,17 @@
+#include
+#include
+#include
+#include
+#include "knnquery_cuda_kernel.h"
+
+
+// k-NN entry point exposed to Python: extracts the raw data pointers and calls the launcher.
+void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor)
+{
+    const float *xyz = xyz_tensor.data_ptr<float>();
+    const float *new_xyz = new_xyz_tensor.data_ptr<float>();
+    const int *offset = offset_tensor.data_ptr<int>();
+    const int *new_offset = new_offset_tensor.data_ptr<int>();
+    int *idx = idx_tensor.data_ptr<int>();
+    float *dist2 = dist2_tensor.data_ptr<float>();
+    knnquery_cuda_launcher(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..83762bc0110e38c7b5fa8adf0ef4ce255bc9d0b9
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.cu
@@ -0,0 +1,116 @@
+#include "../cuda_utils.h"
+#include "knnquery_cuda_kernel.h"
+
+
+// Capacity of the per-thread candidate heap in knnquery_cuda_kernel.
+#ifndef KNNQUERY_MAX_NSAMPLE
+#define KNNQUERY_MAX_NSAMPLE 100
+#endif
+
+
+// Exchanges two floats in place.
+__device__ void swap_float(float *x, float *y)
+{
+    float tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+
+// Exchanges two ints in place.
+__device__ void swap_int(int *x, int *y)
+{
+    int tmp = *x;
+    *x = *y;
+    *y = tmp;
+}
+
+
+// Sifts the root of a max-heap of k (dist, idx) pairs down until no child is larger.
+__device__ void reheap(float *dist, int *idx, int k)
+{
+    int root = 0;
+    int child = root * 2 + 1;
+    while (child < k)
+    {
+        // pick the larger of the two children
+        if(child + 1 < k && dist[child+1] > dist[child])
+            child++;
+        if(dist[root] > dist[child])
+            return;
+        swap_float(&dist[root], &dist[child]);
+        swap_int(&idx[root], &idx[child]);
+        root = child;
+        child = root * 2 + 1;
+    }
+}
+
+
+// Turns the max-heap into ascending order by repeatedly moving the current
+// maximum to the end and re-heapifying the shrunken prefix.
+__device__ void heap_sort(float *dist, int *idx, int k)
+{
+    int i;
+    for (i = k - 1; i > 0; i--)
+    {
+        swap_float(&dist[0], &dist[i]);
+        swap_int(&idx[0], &idx[i]);
+        reheap(dist, idx, i);
+    }
+}
+
+
+// Returns the first i with idx < offset[i]. The scan has no upper bound, so
+// offset must contain an entry greater than idx.
+__device__ int get_bt_idx(int idx, const int *offset)
+{
+    int i = 0;
+    while (1)
+    {
+        if (idx < offset[i])
+            break;
+        else
+            i++;
+    }
+    return i;
+}
+
+
+// One thread per query point: scans the points of the query's batch segment
+// [start, end) and keeps the nsample smallest squared distances in a max-heap,
+// then writes them out in ascending order.
+// nsample must not exceed KNNQUERY_MAX_NSAMPLE; larger values are rejected
+// (nothing is written) instead of overflowing the local heap arrays.
+__global__ void knnquery_cuda_kernel(int m, int nsample, const float *__restrict__ xyz, const float *__restrict__ new_xyz, const int *__restrict__ offset, const int *__restrict__ new_offset, int *__restrict__ idx, float *__restrict__ dist2) {
+    // input: xyz (n, 3) new_xyz (m, 3)
+    // output: idx (m, nsample) dist2 (m, nsample)
+    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (pt_idx >= m || nsample > KNNQUERY_MAX_NSAMPLE) return;
+
+    new_xyz += pt_idx * 3;
+    idx += pt_idx * nsample;
+    dist2 += pt_idx * nsample;
+    int bt_idx = get_bt_idx(pt_idx, new_offset);
+    int start;
+    if (bt_idx == 0)
+        start = 0;
+    else
+        start = offset[bt_idx - 1];
+    int end = offset[bt_idx];
+
+    float new_x = new_xyz[0];
+    float new_y = new_xyz[1];
+    float new_z = new_xyz[2];
+
+    float best_dist[KNNQUERY_MAX_NSAMPLE];
+    int best_idx[KNNQUERY_MAX_NSAMPLE];
+    // seed every slot with a large distance and the segment's first point
+    for(int i = 0; i < nsample; i++){
+        best_dist[i] = 1e10;
+        best_idx[i] = start;
+    }
+    for(int i = start; i < end; i++){
+        float x = xyz[i * 3 + 0];
+        float y = xyz[i * 3 + 1];
+        float z = xyz[i * 3 + 2];
+        float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z);
+        // best_dist[0] is the largest kept distance; replace it when a closer point shows up
+        if (d2 < best_dist[0]){
+            best_dist[0] = d2;
+            best_idx[0] = i;
+            reheap(best_dist, best_idx, nsample);
+        }
+    }
+    heap_sort(best_dist, best_idx, nsample);
+    for(int i = 0; i < nsample; i++){
+        idx[i] = best_idx[i];
+        dist2[i] = best_dist[i];
+    }
+}
+
+
+// Launches knnquery_cuda_kernel with one thread per query point.
+// The kernel writes nothing when nsample > KNNQUERY_MAX_NSAMPLE.
+void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2) {
+    // input: new_xyz: (m, 3), xyz: (n, 3), idx: (m, nsample)
+    dim3 blocks(DIVUP(m, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    knnquery_cuda_kernel<<<blocks, threads>>>(m, nsample, xyz, new_xyz, offset, new_offset, idx, dist2);
+}
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c0aedfe8fbe6c427ee15bb550c2c1829e9f4b97
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/knnquery/knnquery_cuda_kernel.h
@@ -0,0 +1,18 @@
+#ifndef _KNNQUERY_CUDA_KERNEL
+#define _KNNQUERY_CUDA_KERNEL
+#include
+#include
+#include
+
+void knnquery_cuda(int m, int nsample, at::Tensor xyz_tensor, at::Tensor new_xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor idx_tensor, at::Tensor dist2_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void knnquery_cuda_launcher(int m, int nsample, const float *xyz, const float *new_xyz, const int *offset, const int *new_offset, int *idx, float *dist2);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/pointops_api.cpp b/models/Mask3D/mask3d/utils/pointops2/src/pointops_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..812789f7d4fdf961b960641ba6c2fd660c16a654
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/pointops_api.cpp
@@ -0,0 +1,45 @@
+#include
+#include
+
+#include "knnquery/knnquery_cuda_kernel.h"
+#include "sampling/sampling_cuda_kernel.h"
+#include "grouping/grouping_cuda_kernel.h"
+#include "interpolation/interpolation_cuda_kernel.h"
+#include "aggregation/aggregation_cuda_kernel.h"
+#include "subtraction/subtraction_cuda_kernel.h"
+#include "attention/attention_cuda_kernel.h"
+#include "rpe/relative_pos_encoding_cuda_kernel.h"
+#include "attention_v2/attention_cuda_kernel_v2.h"
+#include "rpe_v2/relative_pos_encoding_cuda_kernel_v2.h"
+
+
+// Python bindings: every host wrapper is registered under its own C++ name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("knnquery_cuda", &knnquery_cuda, "knnquery_cuda");
+    m.def("furthestsampling_cuda", &furthestsampling_cuda, "furthestsampling_cuda");
+    m.def("grouping_forward_cuda", &grouping_forward_cuda, "grouping_forward_cuda");
+    m.def("grouping_backward_cuda", &grouping_backward_cuda, "grouping_backward_cuda");
+    m.def("interpolation_forward_cuda", &interpolation_forward_cuda, "interpolation_forward_cuda");
+    m.def("interpolation_backward_cuda", &interpolation_backward_cuda, "interpolation_backward_cuda");
+    m.def("subtraction_forward_cuda", &subtraction_forward_cuda, "subtraction_forward_cuda");
+    m.def("subtraction_backward_cuda", &subtraction_backward_cuda, "subtraction_backward_cuda");
+    m.def("aggregation_forward_cuda", &aggregation_forward_cuda, "aggregation_forward_cuda");
+    m.def("aggregation_backward_cuda", &aggregation_backward_cuda, "aggregation_backward_cuda");
+    m.def("attention_step1_forward_cuda", &attention_step1_forward_cuda, "attention_step1_forward_cuda");
+    m.def("attention_step1_backward_cuda", &attention_step1_backward_cuda, "attention_step1_backward_cuda");
+    m.def("attention_step2_forward_cuda", &attention_step2_forward_cuda, "attention_step2_forward_cuda");
+    m.def("attention_step2_backward_cuda", &attention_step2_backward_cuda, "attention_step2_backward_cuda");
+    m.def("dot_prod_with_idx_forward_cuda", &dot_prod_with_idx_forward_cuda, "dot_prod_with_idx_forward_cuda");
+    m.def("dot_prod_with_idx_backward_cuda", &dot_prod_with_idx_backward_cuda, "dot_prod_with_idx_backward_cuda");
+    m.def("attention_step2_with_rel_pos_value_forward_cuda", &attention_step2_with_rel_pos_value_forward_cuda, "attention_step2_with_rel_pos_value_forward_cuda");
+    m.def("attention_step2_with_rel_pos_value_backward_cuda", &attention_step2_with_rel_pos_value_backward_cuda, "attention_step2_with_rel_pos_value_backward_cuda");
+    m.def("attention_step1_forward_cuda_v2", &attention_step1_forward_cuda_v2, "attention_step1_forward_cuda_v2");
+    m.def("attention_step1_backward_cuda_v2", &attention_step1_backward_cuda_v2, "attention_step1_backward_cuda_v2");
+    m.def("attention_step2_forward_cuda_v2", &attention_step2_forward_cuda_v2, "attention_step2_forward_cuda_v2");
+    m.def("attention_step2_backward_cuda_v2", &attention_step2_backward_cuda_v2, "attention_step2_backward_cuda_v2");
+    m.def("dot_prod_with_idx_forward_cuda_v2", &dot_prod_with_idx_forward_cuda_v2, "dot_prod_with_idx_forward_cuda_v2");
+    m.def("dot_prod_with_idx_backward_cuda_v2", &dot_prod_with_idx_backward_cuda_v2, "dot_prod_with_idx_backward_cuda_v2");
+    m.def("attention_step2_with_rel_pos_value_forward_cuda_v2", &attention_step2_with_rel_pos_value_forward_cuda_v2, "attention_step2_with_rel_pos_value_forward_cuda_v2");
+    m.def("attention_step2_with_rel_pos_value_backward_cuda_v2", &attention_step2_with_rel_pos_value_backward_cuda_v2, "attention_step2_with_rel_pos_value_backward_cuda_v2");
+    m.def("dot_prod_with_idx_forward_cuda_v3", &dot_prod_with_idx_forward_cuda_v3, "dot_prod_with_idx_forward_cuda_v3");
+    m.def("dot_prod_with_idx_backward_cuda_v3", &dot_prod_with_idx_backward_cuda_v3, "dot_prod_with_idx_backward_cuda_v3");
+    }
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..634ebb07520a0bd6fbcdf856679cc908eb2bec40
--- /dev/null
+++
b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda.cpp
@@ -0,0 +1,60 @@
+#include
+#include
+#include
+#include
+#include "relative_pos_encoding_cuda_kernel.h"
+
+// Host wrappers: each one extracts the raw data pointers from its tensors and
+// calls the matching launcher. Element types follow the pointer each result is
+// assigned to (float data, int indices).
+
+void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor,
+    at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor)
+{
+    const float *q = q_tensor.data_ptr<float>();
+    const float *table = table_tensor.data_ptr<float>();
+    const int *index = index_tensor.data_ptr<int>();
+    const int *rel_idx = rel_idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    dot_prod_with_idx_forward_cuda_launcher(N, M, h, hdim, q, index, table, rel_idx, output);
+}
+
+void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor,
+    at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor,
+    at::Tensor grad_q_tensor, at::Tensor grad_table_tensor)
+{
+    const float *grad_out = grad_out_tensor.data_ptr<float>();
+    const float *q = q_tensor.data_ptr<float>();
+    const int *index = index_tensor.data_ptr<int>();
+    const float *table = table_tensor.data_ptr<float>();
+    const int *rel_idx = rel_idx_tensor.data_ptr<int>();
+    float *grad_q = grad_q_tensor.data_ptr<float>();
+    float *grad_table = grad_table_tensor.data_ptr<float>();
+    dot_prod_with_idx_backward_cuda_launcher(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table);
+}
+
+void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor)
+{
+    const float *attn = attn_tensor.data_ptr<float>();
+    const float *v = v_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    const float *table = table_tensor.data_ptr<float>();
+    const int *rel_idx = rel_idx_tensor.data_ptr<int>();
+    float *output = output_tensor.data_ptr<float>();
+    attention_step2_with_rel_pos_value_forward_cuda_launcher(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output);
+}
+
+void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor,
+    at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor,
+    at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor)
+{
+    const float *grad_out = grad_out_tensor.data_ptr<float>();
+    const int *index0 = index0_tensor.data_ptr<int>();
+    const int *index1 = index1_tensor.data_ptr<int>();
+    const float *attn = attn_tensor.data_ptr<float>();
+    const float *v = v_tensor.data_ptr<float>();
+    const float *table = table_tensor.data_ptr<float>();
+    const int *rel_idx = rel_idx_tensor.data_ptr<int>();
+    float *grad_attn = grad_attn_tensor.data_ptr<float>();
+    float *grad_v = grad_v_tensor.data_ptr<float>();
+    float *grad_table = grad_table_tensor.data_ptr<float>();
+    attention_step2_with_rel_pos_value_backward_cuda_launcher(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table);
+}
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b8fd8f42116ae0487c741c9b856c10c491f215f9
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.cu
@@ -0,0 +1,134 @@
+#include "../cuda_utils.h"
+#include "relative_pos_encoding_cuda_kernel.h"
+
+
+// Grid layout shared by all four kernels in this file: x covers the M*3
+// (pair, axis) combinations, blockIdx.y is the head and blockIdx.z the channel.
+
+// Adds q[index[m], h, c] * table[rel_idx[m, dim], h, c, dim] into output[m, h].
+// atomicAdd is needed because every (dim, c) thread of a pair targets the same element.
+__global__ void dot_prod_with_idx_forward_cuda_kernel( // M, h, hdim
+    int N, int M, int h, int hdim, const float *q, const int *index,
+    const float *table, const int *rel_idx, float *output) {
+    // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h)
+
+    int c_idx = blockIdx.z;
+    int h_idx = blockIdx.y;
+    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return;
+
+    int dim = thread_idx % 3;
+    int m_idx = thread_idx / 3;
+
+    int q_idx = index[m_idx];
+    int rel_idx_dim = rel_idx[thread_idx];
+    float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim];
+    float val = q[q_idx*h*hdim+h_idx*hdim+c_idx] * rel_table_val;
+    atomicAdd(output+m_idx*h+h_idx, val);
+}
+
+// Gradient of the kernel above: each thread adds its contribution to grad_q
+// (grad_out * table value) and to grad_table (grad_out * q value).
+__global__ void dot_prod_with_idx_backward_cuda_kernel( // M, h, hdim
+    int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index,
+    const float *table, const int *rel_idx, float *grad_q, float *grad_table) {
+
+    int c_idx = blockIdx.z;
+    int h_idx = blockIdx.y;
+    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return;
+
+    int dim = thread_idx % 3;
+    int m_idx = thread_idx / 3;
+
+    int q_idx = index[m_idx];
+    int rel_idx_dim = rel_idx[thread_idx];
+    int grad_out_idx = m_idx*h+h_idx;
+    float grad_out_value = grad_out[grad_out_idx];
+
+    float rel_table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim];
+    atomicAdd(grad_q+q_idx*h*hdim+h_idx*hdim+c_idx, grad_out_value * rel_table_val);
+
+    float q_value = q[q_idx*h*hdim+h_idx*hdim+c_idx];
+    atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * q_value);
+}
+
+void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index,
+    const float *table, const int *rel_idx, float *output) {
+    // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3)
+    dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim);
+    dim3 threads(THREADS_PER_BLOCK);
+    dot_prod_with_idx_forward_cuda_kernel<<<blocks, threads>>>(N, M, h, hdim, q, index, table, rel_idx, output);
+}
+
+void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out,
+    const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table) {
+    // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3)
+    dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim);
+    dim3 threads(THREADS_PER_BLOCK);
+    dot_prod_with_idx_backward_cuda_kernel<<<blocks, threads>>>(N, M, h, hdim, grad_out, q, index, table, rel_idx, grad_q, grad_table);
+}
+
+// For each pair m and axis dim, adds attn[m, h] * (v[index1[m], h, c] / 3 + table value)
+// into output[index0[m], h, c]. The value term is divided by 3 because it is
+// added once per axis thread.
+__global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel( // M, h, hdim
+    int N, int M, int h, int hdim, const float *attn, const float *v,
+    const int *index0, const int *index1, const float *table, const int *rel_idx, float *output) {
+    // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3)
+
+    int c_idx = blockIdx.z;
+    int h_idx = blockIdx.y;
+    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return;
+
+    int dim = thread_idx % 3;
+    int m_idx = thread_idx / 3;
+
+    int idx1 = index1[m_idx];
+
+    int rel_idx_dim = rel_idx[thread_idx];
+    float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim];
+
+    // float literal keeps the whole expression in single precision, matching the backward kernel
+    float val = attn[m_idx*h+h_idx] * (v[idx1*h*hdim+h_idx*hdim+c_idx] / 3.0f + table_val);
+
+    int idx0 = index0[m_idx];
+    atomicAdd(output+idx0*h*hdim+h_idx*hdim+c_idx, val);
+}
+
+
+// Gradient of the kernel above with respect to attn, v and table.
+__global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel( // M, h, hdim
+    int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table,
+    const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) {
+    // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3)
+
+    int c_idx = blockIdx.z;
+    int h_idx = blockIdx.y;
+    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_idx >= M*3 || h_idx >= h || c_idx >= hdim) return;
+
+    int dim = thread_idx % 3;
+    int m_idx = thread_idx / 3;
+
+    int idx0 = index0[m_idx];
+    int idx1 = index1[m_idx];
+    int grad_out_idx = idx0*h*hdim+h_idx*hdim+c_idx;
+
+    int rel_idx_dim = rel_idx[thread_idx];
+    float table_val = table[rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim];
+    float grad_out_value = grad_out[grad_out_idx];
+
+    atomicAdd(grad_attn+m_idx*h+h_idx, grad_out_value * (v[idx1*h*hdim+h_idx*hdim+c_idx]/3 + table_val));
+    atomicAdd(grad_v+idx1*h*hdim+h_idx*hdim+c_idx, grad_out_value * attn[m_idx*h+h_idx]/3);
+    atomicAdd(grad_table+rel_idx_dim*h*hdim*3+h_idx*hdim*3+c_idx*3+dim, grad_out_value * attn[m_idx*h+h_idx]);
+}
+
+void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0,
+    const int *index1, const float *table, const int *rel_idx, float *output) {
+    // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3)
+    dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_step2_with_rel_pos_value_forward_cuda_kernel<<<blocks, threads>>>(N, M, h, hdim, attn, v, index0, index1, table, rel_idx, output);
+}
+
+void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0,
+    const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) {
+    // input: grad_out (read at [index0[m], h, hdim]), attn: (M, h), v: (N, h, hdim), table: (L, h, hdim, 3), rel_idx: (M, 3)
+    // output: grad_attn: (M, h), grad_v: same layout as v, grad_table: same layout as table
+    dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim);
+    dim3 threads(THREADS_PER_BLOCK);
+    attention_step2_with_rel_pos_value_backward_cuda_kernel<<<blocks, threads>>>(N, M, h, hdim, grad_out, index0, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table);
+}
diff --git
a/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..cafc7b69152fff9c0c440a093346fb6005923db0
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/rpe/relative_pos_encoding_cuda_kernel.h
@@ -0,0 +1,26 @@
+#ifndef _RPE_CUDA_KERNEL
+#define _RPE_CUDA_KERNEL
+#include
+#include
+#include
+
+void dot_prod_with_idx_forward_cuda(int N, int M, int h, int hdim, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor);
+void dot_prod_with_idx_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_table_tensor);
+
+void attention_step2_with_rel_pos_value_forward_cuda(int N, int M, int h, int hdim, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor);
+void attention_step2_with_rel_pos_value_backward_cuda(int N, int M, int h, int hdim, at::Tensor grad_out_tensor, at::Tensor index0_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void dot_prod_with_idx_forward_cuda_launcher(int N, int M, int h, int hdim, const float *q, const int *index, const float *table, const int *rel_idx, float *output);
+void dot_prod_with_idx_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const float *q, const int *index, const float *table, const int *rel_idx, float *grad_q, float *grad_table);
+
+void attention_step2_with_rel_pos_value_forward_cuda_launcher(int N, int M, int h, int hdim, const float *attn, const float *v, const int *index0, const int *index1, const float *table, const int *rel_idx, float *output);
+void attention_step2_with_rel_pos_value_backward_cuda_launcher(int N, int M, int h, int hdim, const float *grad_out, const int *index0, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..628d8e3ab9679ac14fc89872595927c6f997198f
--- /dev/null
+++ b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.cu
@@ -0,0 +1,525 @@
+#include "../cuda_utils.h"
+#include "relative_pos_encoding_cuda_kernel_v2.h"
+
+
+// N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, output
+
+// One block per (bucket t_idx, head h_idx). The bucket is the range
+// [rel_idx_offsets[t], rel_idx_offsets[t+1]) of sort_indices; the rel_idx triple
+// is read once, from the bucket's first pair, and reused for every pair in it.
+// The block caches the summed table_q / table_k rows for that triple in shared
+// memory, then each thread walks the bucket with stride blockDim.x and writes
+// output[m, h] = q . table_q_vec + k . table_k_vec.
+// d is the per-head channel count (compile-time, sizes the shared arrays).
+template <int d>
+__global__ void dot_prod_with_idx_forward_cuda_kernel_v2( // M, h, hdim
+    int N, int M, int h, const float *q, const int *index_q, const float *k, const int *index_k,
+    const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets,
+    const int *sort_indices, float *output) {
+    // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h)
+
+    int h_idx = blockIdx.y;
+    int t_idx = blockIdx.x;
+    int n_idx = threadIdx.x;
+    int C = h*d;
+
+    __shared__ int start, end;
+    if(n_idx == 0){
+        start = rel_idx_offsets[t_idx];
+        end = rel_idx_offsets[t_idx+1];
+    }
+
+    __syncthreads();
+
+    // No early return for threads past `end`: every thread must reach the barriers below.
+    int m_idx_prev = start + n_idx;
+
+    __shared__ int m_idx;
+    if(n_idx == 0)
+        m_idx = sort_indices[m_idx_prev];
+
+    __syncthreads();
+
+    __shared__ int rel_idx_vec[3];
+    // strided so all three entries are filled even when the block has fewer than three threads
+    for(int i = n_idx; i < 3; i += blockDim.x)
+        rel_idx_vec[i] = rel_idx[m_idx*3 + i];
+
+    __syncthreads();
+
+    __shared__ float table_q_vec[d];
+    __shared__ float table_k_vec[d];
+
+    // i in [0, d) fills table_q_vec, i in [d, 2d) fills table_k_vec
+    for(int i = n_idx; i < 2*d; i += blockDim.x){
+        if (i < d){
+            int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3 + 0;
+            int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1;
+            int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2;
+            table_q_vec[i] = table_q[ind0] + table_q[ind1] + table_q[ind2];
+        } else{
+            int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 0;
+            int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1;
+            int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2;
+            table_k_vec[i-d] = table_k[ind0] + table_k[ind1] + table_k[ind2];
+        }
+    }
+
+    __syncthreads();
+
+    for(int i = m_idx_prev; i < end; i += blockDim.x){
+        float sum = 0;
+        int m_idx_i = sort_indices[i];
+        int q_idx = index_q[m_idx_i];
+        int k_idx = index_k[m_idx_i];
+        for(int j = 0; j < d; j++){
+            sum += q[q_idx*C + h_idx*d + j] * table_q_vec[j];
+            sum += k[k_idx*C + h_idx*d + j] * table_k_vec[j];
+        }
+        output[m_idx_i*h + h_idx] = sum;
+    }
+}
+
+// N, M, h, hdim, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k
+
+// Backward of the kernel above, same block/bucket layout. Gradients for q and k
+// go straight to global memory with atomicAdd; gradients for the two tables are
+// first summed per block in shared memory and then added to the three table
+// rows selected by the bucket's rel_idx triple.
+template <int d>
+__global__ void dot_prod_with_idx_backward_cuda_kernel_v2( // M, h, hdim
+    int N, int M, int h, const float *grad_out, const float *q, const int *index_q,
+    const float *k, const int *index_k, const float *table_q, const float *table_k,
+    const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q,
+    float *grad_k, float *grad_table_q, float *grad_table_k) {
+
+    int h_idx = blockIdx.y;
+    int t_idx = blockIdx.x;
+    int n_idx = threadIdx.x;
+    int C = h*d;
+
+    __shared__ int start, end;
+    if(n_idx == 0){
+        start = rel_idx_offsets[t_idx];
+        end = rel_idx_offsets[t_idx+1];
+    }
+
+    __syncthreads();
+
+    // No early return for threads past `end`: every thread must reach the barriers below.
+    int m_idx_prev = start + n_idx;
+
+    __shared__ int m_idx;
+    if(n_idx == 0)
+        m_idx = sort_indices[m_idx_prev];
+
+    __syncthreads();
+
+    __shared__ int rel_idx_vec[3];
+    // strided so all three entries are filled even when the block has fewer than three threads
+    for(int i = n_idx; i < 3; i += blockDim.x)
+        rel_idx_vec[i] = rel_idx[m_idx*3 + i];
+
+    __syncthreads();
+
+    __shared__ float table_q_vec[d];
+    __shared__ float table_k_vec[d];
+
+    for(int i = n_idx; i < 2*d; i += blockDim.x){
+        if (i < d){
+            int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3 + 0;
+            int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1;
+            int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2;
+            table_q_vec[i] = table_q[ind0] + table_q[ind1] + table_q[ind2];
+        } else{
+            int ind0 = rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 0;
+            int ind1 = rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1;
+            int ind2 = rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2;
+            table_k_vec[i-d] = table_k[ind0] + table_k[ind1] + table_k[ind2];
+        }
+    }
+
+    // per-block accumulators for the table gradients
+    __shared__ float gradient_q[d];
+    __shared__ float gradient_k[d];
+    for(int i = n_idx; i < d; i += blockDim.x){
+        gradient_q[i] = 0;
+        gradient_k[i] = 0;
+    }
+
+    __syncthreads();
+
+    for(int i = m_idx_prev; i < end; i += blockDim.x){
+        int m_idx_i = sort_indices[i];
+        int q_idx = index_q[m_idx_i];
+        int k_idx = index_k[m_idx_i];
+        float grad_out_i = grad_out[m_idx_i*h+h_idx];
+        for(int j = 0; j < d; j++){
+            atomicAdd(&gradient_q[j], q[q_idx*C + h_idx*d + j] * grad_out_i);
+            atomicAdd(&gradient_k[j], k[k_idx*C + h_idx*d + j] * grad_out_i);
+            atomicAdd(grad_q + q_idx*C + h_idx*d + j, table_q_vec[j] * grad_out_i);
+            atomicAdd(grad_k + k_idx*C + h_idx*d + j, table_k_vec[j] * grad_out_i);
+        }
+    }
+
+    __syncthreads();
+
+    // flush the shared accumulators to the three table rows of this bucket
+    for(int i = n_idx; i < d*2; i += blockDim.x){
+        if(i < d){
+            atomicAdd(grad_table_q + rel_idx_vec[0] * C * 3 + h_idx * d * 3 + i * 3, gradient_q[i]);
+            atomicAdd(grad_table_q + rel_idx_vec[1] * C * 3 + h_idx * d * 3 + i * 3 + 1, gradient_q[i]);
+            atomicAdd(grad_table_q + rel_idx_vec[2] * C * 3 + h_idx * d * 3 + i * 3 + 2, gradient_q[i]);
+        }else{
+            atomicAdd(grad_table_k + rel_idx_vec[0] * C * 3 + h_idx * d * 3 + (i-d) * 3, gradient_k[i-d]);
+            atomicAdd(grad_table_k + rel_idx_vec[1] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 1, gradient_k[i-d]);
+            atomicAdd(grad_table_k + rel_idx_vec[2] * C * 3 + h_idx * d * 3 + (i-d) * 3 + 2, gradient_k[i-d]);
+        }
+    }
+}
+
+// Launches the v2 forward kernel on a (T, h) grid. The block size is derived
+// from n_max via opt_n_threads, doubled unless it already equals n_max, and
+// falls back to 512 if the doubled value exceeds 1024.
+// Only hdim 16 and 32 are instantiated; any other value throws a C string.
+void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q,
+    const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k,
+    const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output)
+{
+    // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3)
+    dim3 blocks(T, h);
+
+    unsigned int n_threads = opt_n_threads(n_max);
+    n_threads = n_threads == n_max ? n_threads : n_threads * 2;
+    n_threads = n_threads > 1024 ? 512 : n_threads;
+
+    switch (hdim) {
+        case 16:
+            dot_prod_with_idx_forward_cuda_kernel_v2<16><<<blocks, n_threads>>>(N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output);
+            break;
+        case 32:
+            dot_prod_with_idx_forward_cuda_kernel_v2<32><<<blocks, n_threads>>>(N, M, h, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output);
+            break;
+        default:
+            throw "d != 16 and d != 32";
+    }
+}
+
+// Launches the v2 backward kernel; grid and block size are chosen exactly as in
+// the forward launcher. Only hdim 16 and 32 are instantiated.
+void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T,
+    const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k,
+    const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices,
+    float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k)
+{
+    // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3)
+    dim3 blocks(T, h);
+
+    unsigned int n_threads = opt_n_threads(n_max);
+    n_threads = n_threads == n_max ? n_threads : n_threads * 2;
+    n_threads = n_threads > 1024 ? 512 : n_threads;
+
+    switch (hdim) {
+        case 16:
+            dot_prod_with_idx_backward_cuda_kernel_v2<16><<<blocks, n_threads>>>(N, M, h, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k);
+            break;
+        case 32:
+            dot_prod_with_idx_backward_cuda_kernel_v2<32><<<blocks, n_threads>>>(N, M, h, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k);
+            break;
+        default:
+            throw "d != 16 and d != 32";
+    }
+}
+
+
+
+// One block per (query q_idx, head h_idx); thread n_idx handles the pair
+// index_q_offsets[q_idx] + n_idx. The query vector is cached in shared memory and
+// each thread writes output[m, h] = q . table_q(m) + k . table_k(m).
+// Pairs beyond blockDim.x in a query's range are not visited, so the launch must
+// use at least as many threads as the longest range.
+template <int d>
+__global__ void dot_prod_with_idx_forward_cuda_kernel_v3( // M, h, hdim
+    int N, int M, int h, const float *q, const int *index_q_offsets, const float *k, const int *index_k,
+    const float *table_q, const float *table_k, const int *rel_idx, float *output) {
+    // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3), output: (M, h)
+    int q_idx = blockIdx.x;
+    int h_idx = blockIdx.y;
+    int n_idx = threadIdx.x;
+    int C = h*d;
+
+    __shared__ float query_vec[d];
+    __shared__ int start, end;
+    if (n_idx == 0){
+        start = index_q_offsets[q_idx];
+        end = index_q_offsets[q_idx+1];
+    }
+    for(int i = n_idx; i < d; i += blockDim.x)
+        query_vec[i] = q[q_idx*C + h_idx*d + i];
+
+    __syncthreads();
+
+    // safe to leave here: no barrier follows
+    int m_idx = start + n_idx;
+    if(m_idx >= end)
+        return;
+
+    int k_idx = index_k[m_idx];
+    int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2];
+    float sum = 0;
+    for(int i = 0; i < d; i++){
+        float table_q_scalar_i = table_q[r_idx1*C*3+h_idx*d*3+i*3] + table_q[r_idx2*C*3+h_idx*d*3+i*3+1] + table_q[r_idx3*C*3+h_idx*d*3+i*3+2];
+        sum += query_vec[i] * table_q_scalar_i;
+        float table_k_scalar_i = table_k[r_idx1*C*3+h_idx*d*3+i*3] + table_k[r_idx2*C*3+h_idx*d*3+i*3+1] + table_k[r_idx3*C*3+h_idx*d*3+i*3+2];
+        sum += k[k_idx*C+h_idx*d+i] * table_k_scalar_i;
+    }
+    output[m_idx*h + h_idx] = sum;
+
+}
+
+// N, M, h, hdim, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets,
sort_indices, grad_q, grad_k, grad_table_q, grad_table_k + +template +__global__ void dot_prod_with_idx_backward_cuda_kernel_v3( // M, h, hdim + int N, int M, int h, const float *grad_out, const float *q, const int *index_q_offsets, + const float *k, const int *index_k, const float *table_q, const float *table_k, + const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k) { + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + int C = h*d; + + __shared__ float query_vec[d]; + __shared__ int start, end; + if (n_idx == 0){ + start = index_q_offsets[q_idx]; + end = index_q_offsets[q_idx+1]; + } + for(int i = n_idx; i < d; i += blockDim.x) + query_vec[i] = q[q_idx*C + h_idx*d + i]; + + __shared__ float gradients_q[d]; + for(int i = n_idx; i < d; i += blockDim.x){ + gradients_q[i] = 0; + } + + __syncthreads(); + + int m_idx = start + n_idx; + + if(m_idx < end){ + int k_idx = index_k[m_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + float gradient = grad_out[m_idx*h + h_idx]; + for(int i = 0; i < d; i++){ + float table_q_scalar_i = table_q[r_idx1*C*3+h_idx*d*3+i*3] + table_q[r_idx2*C*3+h_idx*d*3+i*3+1] + table_q[r_idx3*C*3+h_idx*d*3+i*3+2]; + float table_k_scalar_i = table_k[r_idx1*C*3+h_idx*d*3+i*3] + table_k[r_idx2*C*3+h_idx*d*3+i*3+1] + table_k[r_idx3*C*3+h_idx*d*3+i*3+2]; + float q_scalar_i = query_vec[i]; + float k_scalar_i = k[k_idx*C+h_idx*d+i]; + atomicAdd(&gradients_q[i], table_q_scalar_i * gradient); + atomicAdd(grad_k+k_idx*C+h_idx*d+i, table_k_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx1*C*3+h_idx*d*3+i*3, q_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx2*C*3+h_idx*d*3+i*3+1, q_scalar_i * gradient); + atomicAdd(grad_table_q+r_idx3*C*3+h_idx*d*3+i*3+2, q_scalar_i * gradient); + atomicAdd(grad_table_k+r_idx1*C*3+h_idx*d*3+i*3, k_scalar_i * gradient); + atomicAdd(grad_table_k+r_idx2*C*3+h_idx*d*3+i*3+1, k_scalar_i * gradient); + 
atomicAdd(grad_table_k+r_idx3*C*3+h_idx*d*3+i*3+2, k_scalar_i * gradient); + } + } + __syncthreads(); + + for(int i = n_idx; i < d; i += blockDim.x){ + grad_q[q_idx*C+h_idx*d+i] = gradients_q[i]; + } +} + +void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *q, + const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, + const int *rel_idx, float *output) +{ + // input: q: (N, h, hdim), index: (M), table: (L, h, hdim, 3), rel_idx: (M, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + dim3 blocks(N, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + + // printf("e1: h: %d, n_max: %d, n_threads: %d\n", h, n_max, n_threads); + + switch (hdim) { + case 16: + dot_prod_with_idx_forward_cuda_kernel_v3<16><<>>(N, M, h, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); + break; + case 32: + dot_prod_with_idx_forward_cuda_kernel_v3<32><<>>(N, M, h, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, + const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, + const float *table_q, const float *table_k, const int *rel_idx, + float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k) +{ + // input: grad_out: (M, h), output: grad_q: (N, h, hdim), grad_table: (L, h, hdim, 3) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + // dim3 threads(THREADS_PER_BLOCK); + + dim3 blocks(N, h); + // dim3 threads(THREADS_PER_BLOCK); + + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? 
n_threads : n_threads * 2; + + switch (hdim) { + case 16: + dot_prod_with_idx_backward_cuda_kernel_v3<16><<>>(N, M, h, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); + break; + case 32: + dot_prod_with_idx_backward_cuda_kernel_v3<32><<>>(N, M, h, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); + break; + default: + throw "d != 16 and d != 32"; + } +} + + +template +__global__ void attention_step2_with_rel_pos_value_forward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *attn, const float *v, + const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + + int C = h*d; + + __shared__ int start, end; + __shared__ float result[d]; + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for (int i = n_idx; i < d; i += blockDim.x){ + result[i] = 0; + } + + __syncthreads(); + + int m_idx = start + n_idx; + if (m_idx < end){ + float attn_scalar = attn[m_idx*h + h_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + for(int i = 0; i < d; i ++){ + int v_idx = index1[m_idx]; + float table_scaler_i = table[r_idx1*C*3+h_idx*d*3+i*3] + table[r_idx2*C*3+h_idx*d*3+i*3+1] + table[r_idx3*C*3+h_idx*d*3+i*3+2]; + float value_scaler_i = v[v_idx*C + h_idx*d + i]; + atomicAdd(&result[i], (table_scaler_i + value_scaler_i) * attn_scalar); + } + } + + __syncthreads(); + + for (int i = n_idx; i < d; i += blockDim.x) + output[q_idx*C + h_idx*d + i] = result[i]; +} + + +template +__global__ void attention_step2_with_rel_pos_value_backward_cuda_kernel_v2( // M, h, hdim + int N, int M, int h, const float *grad_out, const int 
*index0_offsets, const int *index1, const float *attn, const float *v, const float *table, + const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) + + int q_idx = blockIdx.x; + int h_idx = blockIdx.y; + int n_idx = threadIdx.x; + + int C = h*d; + + __shared__ int start, end; + __shared__ float gradients[d]; + + if (n_idx == 0){ + start = index0_offsets[q_idx]; + end = index0_offsets[q_idx+1]; + } + for (int i = n_idx; i < d; i += blockDim.x){ + gradients[i] = grad_out[q_idx*C + h_idx*d + i]; + } + + __syncthreads(); + + int m_idx = start + n_idx; + if (m_idx < end){ + int v_idx = index1[m_idx]; + int r_idx1 = rel_idx[m_idx*3], r_idx2 = rel_idx[m_idx*3+1], r_idx3 = rel_idx[m_idx*3+2]; + float attn_scalar = attn[m_idx*h + h_idx]; + float grad_attn_sum = 0; + for (int i = 0; i < d; i++){ + float grad_out_scaler_i = gradients[i]; + float table_scaler_i = table[r_idx1*C*3+h_idx*d*3+i*3] + table[r_idx2*C*3+h_idx*d*3+i*3+1] + table[r_idx3*C*3+h_idx*d*3+i*3+2]; + float value_scaler_i = v[v_idx*C + h_idx*d + i]; + grad_attn_sum += (table_scaler_i + value_scaler_i) * grad_out_scaler_i; + atomicAdd(grad_v + v_idx*C + h_idx*d + i, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx1*C*3 + h_idx*d*3 + i*3, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx2*C*3 + h_idx*d*3 + i*3 + 1, attn_scalar * grad_out_scaler_i); + atomicAdd(grad_table + r_idx3*C*3 + h_idx*d*3 + i*3 + 2, attn_scalar * grad_out_scaler_i); + } + grad_attn[m_idx*h + h_idx] = grad_attn_sum; + } +} + +void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, + const int *index1, const float *table, const int *rel_idx, float *output) { + // input: attn: (M, h), v: (N, h, hdim), index0: (M, ), index1: (M, ), table: (L, h, hdim, 3), rel_idx: (M, 3) 
+ //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + // dim3 blocks(DIVUP(M*3, THREADS_PER_BLOCK), h, hdim); + // dim3 threads(THREADS_PER_BLOCK); + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? n_threads : n_threads * 2; + + switch (hdim) { + case 16: + attention_step2_with_rel_pos_value_forward_cuda_kernel_v2<16><<>>(N, M, h, attn, v, index0_offsets, index1, table, rel_idx, output); + break; + case 32: + attention_step2_with_rel_pos_value_forward_cuda_kernel_v2<32><<>>(N, M, h, attn, v, index0_offsets, index1, table, rel_idx, output); + break; + default: + throw "d != 16 and d != 32"; + } +} + +void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, + const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + //dim3 blocks(DIVUP(hdim, THREADS_PER_BLOCK), h, M); + + dim3 blocks(N, h); + unsigned int n_threads = opt_n_threads(n_max); + n_threads = n_threads == n_max ? 
n_threads : n_threads * 2; + + switch (hdim) { + case 16: + attention_step2_with_rel_pos_value_backward_cuda_kernel_v2<16><<>>(N, M, h, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); + break; + case 32: + attention_step2_with_rel_pos_value_backward_cuda_kernel_v2<32><<>>(N, M, h, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); + break; + default: + throw "d != 16 and d != 32"; + } +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h new file mode 100644 index 0000000000000000000000000000000000000000..648b152afe16d3011b62ff141a4e20b2a83579b4 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_kernel_v2.h @@ -0,0 +1,32 @@ +#ifndef _RPE_V2_CUDA_KERNEL +#define _RPE_V2_CUDA_KERNEL +#include +#include +#include + +void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor); +void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); + +void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor 
index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor); + +void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor); +void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void dot_prod_with_idx_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *output); +void dot_prod_with_idx_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, int T, const float *grad_out, const float *q, const int *index_q, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, const int *rel_idx_offsets, const int *sort_indices, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); + +void dot_prod_with_idx_forward_cuda_launcher_v3(int N, int M, int h, 
int hdim, int n_max, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *output); +void dot_prod_with_idx_backward_cuda_launcher_v3(int N, int M, int h, int hdim, int n_max, const float *grad_out, const float *q, const int *index_q_offsets, const float *k, const int *index_k, const float *table_q, const float *table_k, const int *rel_idx, float *grad_q, float *grad_k, float *grad_table_q, float *grad_table_k); + +void attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *attn, const float *v, const int *index0_offsets, const int *index1, const float *table, const int *rel_idx, float *output); +void attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(int N, int M, int h, int hdim, int n_max, const float *grad_out, const int *index0_offsets, const int *index1, const float *attn, const float *v, const float *table, const int *rel_idx, float *grad_attn, float *grad_v, float *grad_table); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a4c96a8688536d19611a57a2017ae1ba44f12bf --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/rpe_v2/relative_pos_encoding_cuda_v2.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include "relative_pos_encoding_cuda_kernel_v2.h" + +void dot_prod_with_idx_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor q_tensor, + at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, + at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, at::Tensor sort_indices_tensor, at::Tensor output_tensor) +{ + const float *q = 
q_tensor.data_ptr(); + const int *index_q = index_q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + const int *rel_idx_offsets = rel_idx_offsets_tensor.data_ptr(); + const int *sort_indices = sort_indices_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + dot_prod_with_idx_forward_cuda_launcher_v2(N, M, h, hdim, n_max, T, q, index_q, k, index_k, table_q, table_k, rel_idx, rel_idx_offsets, sort_indices, output); +} + +void dot_prod_with_idx_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, int T, at::Tensor grad_out_tensor, + at::Tensor q_tensor, at::Tensor index_q_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, + at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor rel_idx_offsets_tensor, + at::Tensor sort_indices_tensor, at::Tensor grad_q_tensor, at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const int *index_q = index_q_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + const int *rel_idx_offsets = rel_idx_offsets_tensor.data_ptr(); + const int *sort_indices = sort_indices_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + float *grad_table_q = grad_table_q_tensor.data_ptr(); + float *grad_table_k = grad_table_k_tensor.data_ptr(); + dot_prod_with_idx_backward_cuda_launcher_v2(N, M, h, hdim, n_max, T, grad_out, q, index_q, k, index_k, table_q, table_k, rel_idx, 
rel_idx_offsets, sort_indices, grad_q, grad_k, grad_table_q, grad_table_k); +} + + +void dot_prod_with_idx_forward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor q_tensor, + at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, at::Tensor table_q_tensor, + at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *q = q_tensor.data_ptr(); + const int *index_q_offsets = index_q_offsets_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + dot_prod_with_idx_forward_cuda_launcher_v3(N, M, h, hdim, n_max, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, output); +} + +void dot_prod_with_idx_backward_cuda_v3(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, + at::Tensor q_tensor, at::Tensor index_q_offsets_tensor, at::Tensor k_tensor, at::Tensor index_k_tensor, + at::Tensor table_q_tensor, at::Tensor table_k_tensor, at::Tensor rel_idx_tensor, at::Tensor grad_q_tensor, + at::Tensor grad_k_tensor, at::Tensor grad_table_q_tensor, at::Tensor grad_table_k_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const float *q = q_tensor.data_ptr(); + const int *index_q_offsets = index_q_offsets_tensor.data_ptr(); + const float *k = k_tensor.data_ptr(); + const int *index_k = index_k_tensor.data_ptr(); + const float *table_q = table_q_tensor.data_ptr(); + const float *table_k = table_k_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_q = grad_q_tensor.data_ptr(); + float *grad_k = grad_k_tensor.data_ptr(); + float *grad_table_q = grad_table_q_tensor.data_ptr(); + float *grad_table_k = grad_table_k_tensor.data_ptr(); + dot_prod_with_idx_backward_cuda_launcher_v3(N, M, h, 
hdim, n_max, grad_out, q, index_q_offsets, k, index_k, table_q, table_k, rel_idx, grad_q, grad_k, grad_table_q, grad_table_k); +} + + +void attention_step2_with_rel_pos_value_forward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor attn_tensor, at::Tensor v_tensor, + at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor table_tensor, at::Tensor rel_idx_tensor, at::Tensor output_tensor) +{ + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const int *index0_offsets = index0_offsets_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + attention_step2_with_rel_pos_value_forward_cuda_launcher_v2(N, M, h, hdim, n_max, attn, v, index0_offsets, index1, table, rel_idx, output); +} + +void attention_step2_with_rel_pos_value_backward_cuda_v2(int N, int M, int h, int hdim, int n_max, at::Tensor grad_out_tensor, + at::Tensor index0_offsets_tensor, at::Tensor index1_tensor, at::Tensor attn_tensor, at::Tensor v_tensor, at::Tensor table_tensor, + at::Tensor rel_idx_tensor, at::Tensor grad_attn_tensor, at::Tensor grad_v_tensor, at::Tensor grad_table_tensor) +{ + const float *grad_out = grad_out_tensor.data_ptr(); + const int *index0_offsets = index0_offsets_tensor.data_ptr(); + const int *index1 = index1_tensor.data_ptr(); + const float *attn = attn_tensor.data_ptr(); + const float *v = v_tensor.data_ptr(); + const float *table = table_tensor.data_ptr(); + const int *rel_idx = rel_idx_tensor.data_ptr(); + float *grad_attn = grad_attn_tensor.data_ptr(); + float *grad_v = grad_v_tensor.data_ptr(); + float *grad_table = grad_table_tensor.data_ptr(); + attention_step2_with_rel_pos_value_backward_cuda_launcher_v2(N, M, h, hdim, n_max, grad_out, index0_offsets, index1, attn, v, table, rel_idx, grad_attn, grad_v, grad_table); +} diff --git 
a/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7b2622ae6c4ed8c2f361a555d1c4b5b9ee6a2db7 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda.cpp @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include "sampling_cuda_kernel.h" + + +void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor) +{ + const float *xyz = xyz_tensor.data_ptr(); + const int *offset = offset_tensor.data_ptr(); + const int *new_offset = new_offset_tensor.data_ptr(); + float *tmp = tmp_tensor.data_ptr(); + int *idx = idx_tensor.data_ptr(); + furthestsampling_cuda_launcher(b, n, xyz, offset, new_offset, tmp, idx); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..d2c70b5c9e6374e4b52fa9f9327d6cae9337d17e --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.cu @@ -0,0 +1,171 @@ +#include "../cuda_utils.h" +#include "sampling_cuda_kernel.h" + + +__device__ void __update(float *dists, int *dists_i, int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? 
i2 : i1; +} + +// input xyz: (n, 3), tmp: (b, n_max) +// ouput idx (m) +template +__global__ void furthestsampling_cuda_kernel(const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) +{ + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int bid = blockIdx.x; + int start_n, end_n, start_m, end_m, old; + if (bid == 0) { + start_n = 0; + end_n = offset[0]; + start_m = 0; + end_m = new_offset[0]; + old = 0; + } + else { + start_n = offset[bid - 1]; + end_n = offset[bid]; + start_m = new_offset[bid - 1]; + end_m = new_offset[bid]; + old = offset[bid - 1]; + } + + const int stride = block_size; + int tid = threadIdx.x; + if (tid == 0) idx[start_m] = start_n; + + __syncthreads(); + for (int j = start_m + 1; j < end_m; j++) + { + int besti = start_n; + float best = -1; + float x1 = xyz[old * 3 + 0]; + float y1 = xyz[old * 3 + 1]; + float z1 = xyz[old * 3 + 2]; + for (int k = start_n + tid; k < end_n; k += stride) + { + float x2 = xyz[k * 3 + 0]; + float y2 = xyz[k * 3 + 1]; + float z2 = xyz[k * 3 + 2]; + float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, tmp[k]); + tmp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 1024) { + if (tid < 512) { + __update(dists, dists_i, tid, tid + 512); + } + __syncthreads(); + } + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) + idx[j] = old; + } +} + +void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx) +{ + unsigned int n_threads = opt_n_threads(n); + switch (n_threads) { + case 1024: + furthestsampling_cuda_kernel<1024><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 512: + furthestsampling_cuda_kernel<512><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 256: + furthestsampling_cuda_kernel<256><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 128: + furthestsampling_cuda_kernel<128><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 64: + furthestsampling_cuda_kernel<64><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 32: + 
furthestsampling_cuda_kernel<32><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 16: + furthestsampling_cuda_kernel<16><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 8: + furthestsampling_cuda_kernel<8><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 4: + furthestsampling_cuda_kernel<4><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 2: + furthestsampling_cuda_kernel<2><<>>(xyz, offset, new_offset, tmp, idx); + break; + case 1: + furthestsampling_cuda_kernel<1><<>>(xyz, offset, new_offset, tmp, idx); + break; + default: + furthestsampling_cuda_kernel<512><<>>(xyz, offset, new_offset, tmp, idx); + } +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c903f638eb30bbf5bf01141ed2740cc0cd37452e --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/sampling/sampling_cuda_kernel.h @@ -0,0 +1,18 @@ +#ifndef _SAMPLING_CUDA_KERNEL +#define _SAMPLING_CUDA_KERNEL +#include +#include +#include + +void furthestsampling_cuda(int b, int n, at::Tensor xyz_tensor, at::Tensor offset_tensor, at::Tensor new_offset_tensor, at::Tensor tmp_tensor, at::Tensor idx_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void furthestsampling_cuda_launcher(int b, int n, const float *xyz, const int *offset, const int *new_offset, float *tmp, int *idx); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda.cpp b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa38dc5697312dfe6111931f2d6abcde7c8f0e77 --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include +#include "subtraction_cuda_kernel.h" + + +void 
subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor) +{ + const float *input1 = input1_tensor.data_ptr(); + const float *input2 = input2_tensor.data_ptr(); + const int *idx = idx_tensor.data_ptr(); + float *output = output_tensor.data_ptr(); + subtraction_forward_cuda_launcher(n, nsample, c, input1, input2, idx, output); +} + +void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor) +{ + const int *idx = idx_tensor.data_ptr(); + const float *grad_output = grad_output_tensor.data_ptr(); + float *grad_input1 = grad_input1_tensor.data_ptr(); + float *grad_input2 = grad_input2_tensor.data_ptr(); + subtraction_backward_cuda_launcher(n, nsample, c, idx, grad_output, grad_input1, grad_input2); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b8d4f752940d580ee2b49f1b2946a8d6386d11a --- /dev/null +++ b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.cu @@ -0,0 +1,44 @@ +#include "../cuda_utils.h" +#include "subtraction_cuda_kernel.h" + + +__global__ void subtraction_forward_cuda_kernel(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { + // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int n_idx = index / nsample / c; + const int idx_idx = n_idx * nsample + nsample_idx; + const int input1_idx = n_idx * c + c_idx; + const int input2_idx = idx[idx_idx] * c + c_idx; + 
output[index] = input1[input1_idx] - input2[input2_idx]; +} + +__global__ void subtraction_backward_cuda_kernel(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n * nsample * c) return; + const int c_idx = index % c; + const int nsample_idx = (index / c) % nsample; + const int n_idx = index / nsample / c; + const int idx_idx = n_idx * nsample + nsample_idx; + const int input1_idx = n_idx * c + c_idx; + const int input2_idx = idx[idx_idx] * c + c_idx; + atomicAdd(grad_input1 + input1_idx, grad_output[index]); + atomicAdd(grad_input2 + input2_idx, -grad_output[index]); +} + +void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output) { + // input: input1: (n, c), input2: (n, c), idx: (n, nsample), output: (n, nsample, c) + dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + subtraction_forward_cuda_kernel<<>>(n, nsample, c, input1, input2, idx, output); +} + +void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2) { + // input: grad_output: (n, nsample, c), output: grad_input1: (n, c), grad_input2: (n, c) + dim3 blocks(DIVUP(n * nsample * c, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + subtraction_backward_cuda_kernel<<>>(n, nsample, c, idx, grad_output, grad_input1, grad_input2); +} diff --git a/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..856133d97bdd3dc58f29c746ff240fc9d489c22e --- /dev/null +++ 
b/models/Mask3D/mask3d/utils/pointops2/src/subtraction/subtraction_cuda_kernel.h @@ -0,0 +1,20 @@ +#ifndef _SUBTRACTION_CUDA_KERNEL +#define _SUBTRACTION_CUDA_KERNEL +#include +#include +#include + +void subtraction_forward_cuda(int n, int nsample, int c, at::Tensor input1_tensor, at::Tensor input2_tensor, at::Tensor idx_tensor, at::Tensor output_tensor); +void subtraction_backward_cuda(int n, int nsample, int c, at::Tensor idx_tensor, at::Tensor grad_output_tensor, at::Tensor grad_input1_tensor, at::Tensor grad_input2_tensor); + +#ifdef __cplusplus +extern "C" { +#endif + +void subtraction_forward_cuda_launcher(int n, int nsample, int c, const float *input1, const float *input2, const int *idx, float *output); +void subtraction_backward_cuda_launcher(int n, int nsample, int c, const int *idx, const float *grad_output, float *grad_input1, float *grad_input2); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/models/Mask3D/mask3d/utils/utils.py b/models/Mask3D/mask3d/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bb374e072bc893a6b2e68aa390478cad63671a --- /dev/null +++ b/models/Mask3D/mask3d/utils/utils.py @@ -0,0 +1,133 @@ +import sys + +if sys.version_info[:2] >= (3, 8): + from collections.abc import MutableMapping +else: + from collections import MutableMapping + +import torch +from loguru import logger + + +def flatten_dict(d, parent_key="", sep="_"): + """ + https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys + """ + items = [] + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, MutableMapping): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + +def load_baseline_model(cfg, model): + # if it is Minkoski weights + cfg.model.in_channels = 3 + cfg.model.config.conv1_kernel_size = 5 + cfg.data.add_normals = False + cfg.data.train_dataset.color_mean_std = [(0.5, 0.5, 0.5), 
(1, 1, 1)] + cfg.data.validation_dataset.color_mean_std = [(0.5, 0.5, 0.5), (1, 1, 1)] + cfg.data.test_dataset.color_mean_std = [(0.5, 0.5, 0.5), (1, 1, 1)] + cfg.data.voxel_size = 0.02 + model = model(cfg) + state_dict = torch.load(cfg.general.checkpoint)["state_dict"] + model.model.load_state_dict(state_dict) + return cfg, model + + +def load_backbone_checkpoint_with_missing_or_exsessive_keys(cfg, model): + state_dict = torch.load(cfg.general.backbone_checkpoint)["state_dict"] + correct_dict = dict(model.state_dict()) + + # if parametrs not found in checkpoint they will be randomly initialized + for key in state_dict.keys(): + if correct_dict.pop(f"model.backbone.{key}", None) is None: + logger.warning( + f"Key not found, it will be initialized randomly: {key}" + ) + + # if parametrs have different shape, it will randomly initialize + state_dict = torch.load(cfg.general.backbone_checkpoint)["state_dict"] + correct_dict = dict(model.state_dict()) + for key in correct_dict.keys(): + if key.replace("model.backbone.", "") not in state_dict: + logger.warning(f"{key} not in loaded checkpoint") + state_dict.update( + {key.replace("model.backbone.", ""): correct_dict[key]} + ) + elif ( + state_dict[key.replace("model.backbone.", "")].shape + != correct_dict[key].shape + ): + logger.warning( + f"incorrect shape {key}:{state_dict[key.replace('model.backbone.', '')].shape} vs {correct_dict[key].shape}" + ) + state_dict.update({key: correct_dict[key]}) + + # if we have more keys just discard them + correct_dict = dict(model.state_dict()) + new_state_dict = dict() + for key in state_dict.keys(): + if f"model.backbone.{key}" in correct_dict.keys(): + new_state_dict.update({f"model.backbone.{key}": state_dict[key]}) + elif key in correct_dict.keys(): + new_state_dict.update({key: correct_dict[key]}) + else: + logger.warning(f"excessive key: {key}") + model.load_state_dict(new_state_dict) + return cfg, model + + +def load_checkpoint_with_missing_or_exsessive_keys(cfg, model): + 
state_dict = torch.load(cfg.general.checkpoint)["state_dict"] + correct_dict = dict(model.state_dict()) + + # if parametrs not found in checkpoint they will be randomly initialized + for key in state_dict.keys(): + if correct_dict.pop(key, None) is None: + logger.warning( + f"Key not found, it will be initialized randomly: {key}" + ) + + # if parametrs have different shape, it will randomly initialize + state_dict = torch.load(cfg.general.checkpoint)["state_dict"] + correct_dict = dict(model.state_dict()) + for key in correct_dict.keys(): + if key not in state_dict: + logger.warning(f"{key} not in loaded checkpoint") + state_dict.update({key: correct_dict[key]}) + elif state_dict[key].shape != correct_dict[key].shape: + logger.warning( + f"incorrect shape {key}:{state_dict[key].shape} vs {correct_dict[key].shape}" + ) + state_dict.update({key: correct_dict[key]}) + + # if we have more keys just discard them + correct_dict = dict(model.state_dict()) + new_state_dict = dict() + for key in state_dict.keys(): + if key in correct_dict.keys(): + new_state_dict.update({key: state_dict[key]}) + else: + logger.warning(f"excessive key: {key}") + model.load_state_dict(new_state_dict) + return cfg, model + + +def freeze_until(net, param_name: str = None): + """ + Freeze net until param_name + https://opendatascience.slack.com/archives/CGK4KQBHD/p1588373239292300?thread_ts=1588105223.275700&cid=CGK4KQBHD + Args: + net: + param_name: + Returns: + """ + found_name = False + for name, params in net.named_parameters(): + if name == param_name: + found_name = True + params.requires_grad = found_name diff --git a/models/Mask3D/mask3d/utils/votenet_utils/box_util.py b/models/Mask3D/mask3d/utils/votenet_utils/box_util.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5a56e775ea5d44c9b425f3fab1ba353c452d56 --- /dev/null +++ b/models/Mask3D/mask3d/utils/votenet_utils/box_util.py @@ -0,0 +1,330 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" Helper functions for calculating 2D and 3D bounding box IoU. + +Collected and written by Charles R. Qi +Last modified: Jul 2019 +""" +from __future__ import print_function + +import numpy as np +from scipy.spatial import ConvexHull + + +def polygon_clip(subjectPolygon, clipPolygon): + """Clip a polygon with another polygon. + + Ref: https://rosettacode.org/wiki/Sutherland-Hodgman_polygon_clipping#Python + + Args: + subjectPolygon: a list of (x,y) 2d points, any polygon. + clipPolygon: a list of (x,y) 2d points, has to be *convex* + Note: + **points have to be counter-clockwise ordered** + + Return: + a list of (x,y) vertex point for the intersection polygon. + """ + + def inside(p): + return (cp2[0] - cp1[0]) * (p[1] - cp1[1]) > (cp2[1] - cp1[1]) * ( + p[0] - cp1[0] + ) + + def computeIntersection(): + dc = [cp1[0] - cp2[0], cp1[1] - cp2[1]] + dp = [s[0] - e[0], s[1] - e[1]] + n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0] + n2 = s[0] * e[1] - s[1] * e[0] + n3 = 1.0 / (dc[0] * dp[1] - dc[1] * dp[0]) + return [(n1 * dp[0] - n2 * dc[0]) * n3, (n1 * dp[1] - n2 * dc[1]) * n3] + + outputList = subjectPolygon + cp1 = clipPolygon[-1] + + for clipVertex in clipPolygon: + cp2 = clipVertex + inputList = outputList + outputList = [] + s = inputList[-1] + + for subjectVertex in inputList: + e = subjectVertex + if inside(e): + if not inside(s): + outputList.append(computeIntersection()) + outputList.append(e) + elif inside(s): + outputList.append(computeIntersection()) + s = e + cp1 = cp2 + if len(outputList) == 0: + return None + return outputList + + +def poly_area(x, y): + """Ref: http://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates""" + return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + +def convex_hull_intersection(p1, p2): + """Compute area of two convex hull's intersection area. 
+ p1,p2 are a list of (x,y) tuples of hull vertices. + return a list of (x,y) for the intersection and its volume + """ + inter_p = polygon_clip(p1, p2) + if inter_p is not None: + hull_inter = ConvexHull(inter_p) + return inter_p, hull_inter.volume + else: + return None, 0.0 + + +def box3d_vol(corners): + """corners: (8,3) no assumption on axis direction""" + a = np.sqrt(np.sum((corners[0, :] - corners[1, :]) ** 2)) + b = np.sqrt(np.sum((corners[1, :] - corners[2, :]) ** 2)) + c = np.sqrt(np.sum((corners[0, :] - corners[4, :]) ** 2)) + return a * b * c + + +def is_clockwise(p): + x = p[:, 0] + y = p[:, 1] + return np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)) > 0 + + +def box3d_iou(corners1, corners2): + """Compute 3D bounding box IoU. + + Input: + corners1: numpy array (8,3), assume up direction is negative Y + corners2: numpy array (8,3), assume up direction is negative Y + Output: + iou: 3D bounding box IoU + iou_2d: bird's eye view 2D bounding box IoU + + todo (rqi): add more description on corner points' orders. + """ + # corner points are in counter clockwise order + rect1 = [(corners1[i, 0], corners1[i, 2]) for i in range(3, -1, -1)] + rect2 = [(corners2[i, 0], corners2[i, 2]) for i in range(3, -1, -1)] + area1 = poly_area(np.array(rect1)[:, 0], np.array(rect1)[:, 1]) + area2 = poly_area(np.array(rect2)[:, 0], np.array(rect2)[:, 1]) + inter, inter_area = convex_hull_intersection(rect1, rect2) + iou_2d = inter_area / (area1 + area2 - inter_area) + ymax = min(corners1[0, 1], corners2[0, 1]) + ymin = max(corners1[4, 1], corners2[4, 1]) + inter_vol = inter_area * max(0.0, ymax - ymin) + vol1 = box3d_vol(corners1) + vol2 = box3d_vol(corners2) + iou = inter_vol / (vol1 + vol2 - inter_vol) + return iou, iou_2d + + +def get_iou(bb1, bb2): + """ + Calculate the Intersection over Union (IoU) of two 2D bounding boxes. 
+ + Parameters + ---------- + bb1 : dict + Keys: {'x1', 'x2', 'y1', 'y2'} + The (x1, y1) position is at the top left corner, + the (x2, y2) position is at the bottom right corner + bb2 : dict + Keys: {'x1', 'x2', 'y1', 'y2'} + The (x, y) position is at the top left corner, + the (x2, y2) position is at the bottom right corner + + Returns + ------- + float + in [0, 1] + """ + assert bb1["x1"] < bb1["x2"] + assert bb1["y1"] < bb1["y2"] + assert bb2["x1"] < bb2["x2"] + assert bb2["y1"] < bb2["y2"] + + # determine the coordinates of the intersection rectangle + x_left = max(bb1["x1"], bb2["x1"]) + y_top = max(bb1["y1"], bb2["y1"]) + x_right = min(bb1["x2"], bb2["x2"]) + y_bottom = min(bb1["y2"], bb2["y2"]) + + if x_right < x_left or y_bottom < y_top: + return 0.0 + + # The intersection of two axis-aligned bounding boxes is always an + # axis-aligned bounding box + intersection_area = (x_right - x_left) * (y_bottom - y_top) + + # compute the area of both AABBs + bb1_area = (bb1["x2"] - bb1["x1"]) * (bb1["y2"] - bb1["y1"]) + bb2_area = (bb2["x2"] - bb2["x1"]) * (bb2["y2"] - bb2["y1"]) + + # compute the intersection over union by taking the intersection + # area and dividing it by the sum of prediction + ground-truth + # areas - the interesection area + iou = intersection_area / float(bb1_area + bb2_area - intersection_area) + assert iou >= 0.0 + assert iou <= 1.0 + return iou + + +def box2d_iou(box1, box2): + """Compute 2D bounding box IoU. 
+ + Input: + box1: tuple of (xmin,ymin,xmax,ymax) + box2: tuple of (xmin,ymin,xmax,ymax) + Output: + iou: 2D IoU scalar + """ + return get_iou( + {"x1": box1[0], "y1": box1[1], "x2": box1[2], "y2": box1[3]}, + {"x1": box2[0], "y1": box2[1], "x2": box2[2], "y2": box2[3]}, + ) + + +# ----------------------------------------------------------- +# Convert from box parameters to +# ----------------------------------------------------------- +def roty(t): + """Rotation about the y-axis.""" + c = np.cos(t) + s = np.sin(t) + return np.array([[c, 0, s], [0, 1, 0], [-s, 0, c]]) + + +def roty_batch(t): + """Rotation about the y-axis. + t: (x1,x2,...xn) + return: (x1,x2,...,xn,3,3) + """ + input_shape = t.shape + output = np.zeros(tuple(list(input_shape) + [3, 3])) + c = np.cos(t) + s = np.sin(t) + output[..., 0, 0] = c + output[..., 0, 2] = s + output[..., 1, 1] = 1 + output[..., 2, 0] = -s + output[..., 2, 2] = c + return output + + +def get_3d_box(box_size, heading_angle, center): + """box_size is array(l,w,h), heading_angle is radius clockwise from pos x axis, center is xyz of box center + output (8,3) array for 3D box cornders + Similar to utils/compute_orientation_3d + """ + R = roty(heading_angle) + l, w, h = box_size + x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2] + y_corners = [h / 2, h / 2, h / 2, h / 2, -h / 2, -h / 2, -h / 2, -h / 2] + z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2] + corners_3d = np.dot(R, np.vstack([x_corners, y_corners, z_corners])) + corners_3d[0, :] = corners_3d[0, :] + center[0] + corners_3d[1, :] = corners_3d[1, :] + center[1] + corners_3d[2, :] = corners_3d[2, :] + center[2] + corners_3d = np.transpose(corners_3d) + return corners_3d + + +def get_3d_box_batch(box_size, heading_angle, center): + """box_size: [x1,x2,...,xn,3] + heading_angle: [x1,x2,...,xn] + center: [x1,x2,...,xn,3] + Return: + [x1,x3,...,xn,8,3] + """ + input_shape = heading_angle.shape + R = roty_batch(heading_angle) + l 
= np.expand_dims(box_size[..., 0], -1) # [x1,...,xn,1] + w = np.expand_dims(box_size[..., 1], -1) + h = np.expand_dims(box_size[..., 2], -1) + corners_3d = np.zeros(tuple(list(input_shape) + [8, 3])) + corners_3d[..., :, 0] = np.concatenate( + (l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2), -1 + ) + corners_3d[..., :, 1] = np.concatenate( + (h / 2, h / 2, h / 2, h / 2, -h / 2, -h / 2, -h / 2, -h / 2), -1 + ) + corners_3d[..., :, 2] = np.concatenate( + (w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2), -1 + ) + tlist = [i for i in range(len(input_shape))] + tlist += [len(input_shape) + 1, len(input_shape)] + corners_3d = np.matmul(corners_3d, np.transpose(R, tuple(tlist))) + corners_3d += np.expand_dims(center, -2) + return corners_3d + + +if __name__ == "__main__": + + # Function for polygon ploting + import matplotlib + from matplotlib.patches import Polygon + from matplotlib.collections import PatchCollection + import matplotlib.pyplot as plt + + def plot_polys(plist, scale=500.0): + fig, ax = plt.subplots() + patches = [] + for p in plist: + poly = Polygon(np.array(p) / scale, True) + patches.append(poly) + + pc = PatchCollection(patches, cmap=matplotlib.cm.jet, alpha=0.5) + colors = 100 * np.random.rand(len(patches)) + pc.set_array(np.array(colors)) + ax.add_collection(pc) + plt.show() + + # Demo on ConvexHull + points = np.random.rand(30, 2) # 30 random points in 2-D + hull = ConvexHull(points) + # **In 2D "volume" is is area, "area" is perimeter + print(("Hull area: ", hull.volume)) + for simplex in hull.simplices: + print(simplex) + + # Demo on convex hull overlaps + sub_poly = [(0, 0), (300, 0), (300, 300), (0, 300)] + clip_poly = [(150, 150), (300, 300), (150, 450), (0, 300)] + inter_poly = polygon_clip(sub_poly, clip_poly) + print(poly_area(np.array(inter_poly)[:, 0], np.array(inter_poly)[:, 1])) + + # Test convex hull interaction function + rect1 = [(50, 0), (50, 300), (300, 300), (300, 0)] + rect2 = [(150, 150), (300, 300), 
(150, 450), (0, 300)] + plot_polys([rect1, rect2]) + inter, area = convex_hull_intersection(rect1, rect2) + print((inter, area)) + if inter is not None: + print(poly_area(np.array(inter)[:, 0], np.array(inter)[:, 1])) + + print("------------------") + rect1 = [ + (0.30026005199835404, 8.9408694211408424), + (-1.1571105364358421, 9.4686676477075533), + (0.1777082043006144, 13.154404877812102), + (1.6350787927348105, 12.626606651245391), + ] + rect1 = [rect1[0], rect1[3], rect1[2], rect1[1]] + rect2 = [ + (0.23908745901608636, 8.8551095691132886), + (-1.2771419487733995, 9.4269062966181956), + (0.13138836963152717, 13.161896351296868), + (1.647617777421013, 12.590099623791961), + ] + rect2 = [rect2[0], rect2[3], rect2[2], rect2[1]] + plot_polys([rect1, rect2]) + inter, area = convex_hull_intersection(rect1, rect2) + print((inter, area)) diff --git a/models/Mask3D/mask3d/utils/votenet_utils/eval_det.py b/models/Mask3D/mask3d/utils/votenet_utils/eval_det.py new file mode 100644 index 0000000000000000000000000000000000000000..77f4f73bfd428cd31c7de9b78286ddb6d40473ff --- /dev/null +++ b/models/Mask3D/mask3d/utils/votenet_utils/eval_det.py @@ -0,0 +1,310 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" Generic Code for Object Detection Evaluation + + Input: + For each class: + For each image: + Predictions: box, score + Groundtruths: box + + Output: + For each class: + precision-recal and average precision + + Author: Charles R. Qi + + Ref: https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/master/lib/datasets/voc_eval.py +""" +import numpy as np + + +def voc_ap(rec, prec, use_07_metric=False): + """ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). 
+ """ + if use_07_metric: + # 11 point metric + ap = 0.0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11.0 + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], rec, [1.0])) + mpre = np.concatenate(([0.0], prec, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +import os +import sys + +# BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +from utils.votenet_utils.metric_util import calc_iou # axis-aligned 3D box IoU + + +def get_iou(bb1, bb2): + """Compute IoU of two bounding boxes. + ** Define your bod IoU function HERE ** + """ + # pass + iou3d = calc_iou(bb1, bb2) + return iou3d + + +from box_util import box3d_iou + + +def get_iou_obb(bb1, bb2): + iou3d, iou2d = box3d_iou(bb1, bb2) + return iou3d + + +def get_iou_main(get_iou_func, args): + return get_iou_func(*args) + + +def eval_det_cls( + pred, gt, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou +): + """Generic functions to compute precision/recall for object detection + for a single class. 
+ Input: + pred: map of {img_id: [(bbox, score)]} where bbox is numpy array + gt: map of {img_id: [bbox]} + ovthresh: scalar, iou threshold + use_07_metric: bool, if True use VOC07 11 point method + Output: + rec: numpy array of length nd + prec: numpy array of length nd + ap: scalar, average precision + """ + + # construct gt objects + class_recs = {} # {img_id: {'bbox': bbox list, 'det': matched list}} + npos = 0 + for img_id in gt.keys(): + bbox = np.array(gt[img_id]) + det = [False] * len(bbox) + npos += len(bbox) + class_recs[img_id] = {"bbox": bbox, "det": det} + # pad empty list to all other imgids + for img_id in pred.keys(): + if img_id not in gt: + class_recs[img_id] = {"bbox": np.array([]), "det": []} + + # construct dets + image_ids = [] + confidence = [] + BB = [] + for img_id in pred.keys(): + for box, score in pred[img_id]: + image_ids.append(img_id) + confidence.append(score) + BB.append(box) + confidence = np.array(confidence) + BB = np.array(BB) # (nd,4 or 8,3 or 6) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, ...] 
+ image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + # if d%100==0: print(d) + R = class_recs[image_ids[d]] + bb = BB[d, ...].astype(float) + ovmax = -np.inf + BBGT = R["bbox"].astype(float) + + if BBGT.size > 0: + # compute overlaps + for j in range(BBGT.shape[0]): + iou = get_iou_main(get_iou_func, (bb, BBGT[j, ...])) + if iou > ovmax: + ovmax = iou + jmax = j + + # print d, ovmax + if ovmax > ovthresh: + if not R["det"][jmax]: + tp[d] = 1.0 + R["det"][jmax] = 1 + else: + fp[d] = 1.0 + else: + fp[d] = 1.0 + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # print('NPOS: ', npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap + + +def eval_det_cls_wrapper(arguments): + pred, gt, ovthresh, use_07_metric, get_iou_func = arguments + rec, prec, ap = eval_det_cls( + pred, gt, ovthresh, use_07_metric, get_iou_func + ) + return (rec, prec, ap) + + +def eval_det( + pred_all, gt_all, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou +): + """Generic functions to compute precision/recall for object detection + for multiple classes. 
+ Input: + pred_all: map of {img_id: [(classname, bbox, score)]} + gt_all: map of {img_id: [(classname, bbox)]} + ovthresh: scalar, iou threshold + use_07_metric: bool, if true use VOC07 11 point method + Output: + rec: {classname: rec} + prec: {classname: prec_all} + ap: {classname: scalar} + """ + pred = {} # map {classname: pred} + gt = {} # map {classname: gt} + for img_id in pred_all.keys(): + for classname, bbox, score in pred_all[img_id]: + if classname not in pred: + pred[classname] = {} + if img_id not in pred[classname]: + pred[classname][img_id] = [] + if classname not in gt: + gt[classname] = {} + if img_id not in gt[classname]: + gt[classname][img_id] = [] + pred[classname][img_id].append((bbox, score)) + for img_id in gt_all.keys(): + for classname, bbox in gt_all[img_id]: + if classname not in gt: + gt[classname] = {} + if img_id not in gt[classname]: + gt[classname][img_id] = [] + # JONAS ADAPTATION TODO + if classname not in pred: + pred[classname] = {} + if img_id not in pred[classname]: + pred[classname][img_id] = [] + # ===================== + gt[classname][img_id].append(bbox) + + rec = {} + prec = {} + ap = {} + for classname in gt.keys(): + print("Computing AP for class: ", classname) + rec[classname], prec[classname], ap[classname] = eval_det_cls( + pred[classname], + gt[classname], + ovthresh, + use_07_metric, + get_iou_func, + ) + print(classname, ap[classname]) + + return rec, prec, ap + + +from multiprocessing import Pool + + +def eval_det_multiprocessing( + pred_all, gt_all, ovthresh=0.25, use_07_metric=False, get_iou_func=get_iou +): + """Generic functions to compute precision/recall for object detection + for multiple classes. 
+ Input: + pred_all: map of {img_id: [(classname, bbox, score)]} + gt_all: map of {img_id: [(classname, bbox)]} + ovthresh: scalar, iou threshold + use_07_metric: bool, if true use VOC07 11 point method + Output: + rec: {classname: rec} + prec: {classname: prec_all} + ap: {classname: scalar} + """ + pred = {} # map {classname: pred} + gt = {} # map {classname: gt} + for img_id in pred_all.keys(): + for classname, bbox, score in pred_all[img_id]: + if classname not in pred: + pred[classname] = {} + if img_id not in pred[classname]: + pred[classname][img_id] = [] + if classname not in gt: + gt[classname] = {} + if img_id not in gt[classname]: + gt[classname][img_id] = [] + pred[classname][img_id].append((bbox, score)) + for img_id in gt_all.keys(): + for classname, bbox in gt_all[img_id]: + if classname not in gt: + gt[classname] = {} + if img_id not in gt[classname]: + gt[classname][img_id] = [] + gt[classname][img_id].append(bbox) + + rec = {} + prec = {} + ap = {} + p = Pool(processes=10) + ret_values = p.map( + eval_det_cls_wrapper, + [ + ( + pred[classname], + gt[classname], + ovthresh, + use_07_metric, + get_iou_func, + ) + for classname in gt.keys() + if classname in pred + ], + ) + p.close() + for i, classname in enumerate(gt.keys()): + if classname in pred: + rec[classname], prec[classname], ap[classname] = ret_values[i] + else: + rec[classname] = 0 + prec[classname] = 0 + ap[classname] = 0 + print(classname, ap[classname]) + + return rec, prec, ap diff --git a/models/Mask3D/mask3d/utils/votenet_utils/metric_util.py b/models/Mask3D/mask3d/utils/votenet_utils/metric_util.py new file mode 100644 index 0000000000000000000000000000000000000000..312589a5e0fa96e2153026a2fa4bd9053b890f54 --- /dev/null +++ b/models/Mask3D/mask3d/utils/votenet_utils/metric_util.py @@ -0,0 +1,194 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" Utility functions for metric evaluation. + +Author: Or Litany and Charles R. Qi +""" + +import os +import sys +import torch + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import numpy as np + +# Mesh IO +import trimesh + + +# ---------------------------------------- +# Precision and Recall +# ---------------------------------------- + + +def multi_scene_precision_recall( + labels, pred, iou_thresh, conf_thresh, label_mask, pred_mask=None +): + """ + Args: + labels: (B, N, 6) + pred: (B, M, 6) + iou_thresh: scalar + conf_thresh: scalar + label_mask: (B, N,) with values in 0 or 1 to indicate which GT boxes to consider. + pred_mask: (B, M,) with values in 0 or 1 to indicate which PRED boxes to consider. + Returns: + TP,FP,FN,Precision,Recall + """ + # Make sure the masks are not Torch tensor, otherwise the mask==1 returns uint8 array instead + # of True/False array as in numpy + assert not torch.is_tensor(label_mask) + assert not torch.is_tensor(pred_mask) + TP, FP, FN = 0, 0, 0 + if label_mask is None: + label_mask = np.ones((labels.shape[0], labels.shape[1])) + if pred_mask is None: + pred_mask = np.ones((pred.shape[0], pred.shape[1])) + for batch_idx in range(labels.shape[0]): + TP_i, FP_i, FN_i = single_scene_precision_recall( + labels[batch_idx, label_mask[batch_idx, :] == 1, :], + pred[batch_idx, pred_mask[batch_idx, :] == 1, :], + iou_thresh, + conf_thresh, + ) + TP += TP_i + FP += FP_i + FN += FN_i + + return TP, FP, FN, precision_recall(TP, FP, FN) + + +def single_scene_precision_recall(labels, pred, iou_thresh, conf_thresh): + """Compute P and R for predicted bounding boxes. Ignores classes! + Args: + labels: (N x bbox) ground-truth bounding boxes (6 dims) + pred: (M x (bbox + conf)) predicted bboxes with confidence and maybe classification + Returns: + TP, FP, FN + """ + + # for each pred box with high conf (C), compute IoU with all gt boxes. 
+ # TP = number of times IoU > th ; FP = C - TP + # FN - number of scene objects without good match + + gt_bboxes = labels[:, :6] + + num_scene_bboxes = gt_bboxes.shape[0] + conf = pred[:, 6] + + conf_pred_bbox = pred[np.where(conf > conf_thresh)[0], :6] + num_conf_pred_bboxes = conf_pred_bbox.shape[0] + + # init an array to keep iou between generated and scene bboxes + iou_arr = np.zeros([num_conf_pred_bboxes, num_scene_bboxes]) + for g_idx in range(num_conf_pred_bboxes): + for s_idx in range(num_scene_bboxes): + iou_arr[g_idx, s_idx] = calc_iou( + conf_pred_bbox[g_idx, :], gt_bboxes[s_idx, :] + ) + + good_match_arr = iou_arr >= iou_thresh + + TP = good_match_arr.any(axis=1).sum() + FP = num_conf_pred_bboxes - TP + FN = num_scene_bboxes - good_match_arr.any(axis=0).sum() + + return TP, FP, FN + + +def precision_recall(TP, FP, FN): + Prec = 1.0 * TP / (TP + FP) if TP + FP > 0 else 0 + Rec = 1.0 * TP / (TP + FN) + return Prec, Rec + + +def calc_iou(box_a, box_b): + """Computes IoU of two axis aligned bboxes. 
+ Args: + box_a, box_b: 6D of center and lengths + Returns: + iou + """ + + max_a = box_a[0:3] + box_a[3:6] / 2 + max_b = box_b[0:3] + box_b[3:6] / 2 + min_max = np.array([max_a, max_b]).min(0) + + min_a = box_a[0:3] - box_a[3:6] / 2 + min_b = box_b[0:3] - box_b[3:6] / 2 + max_min = np.array([min_a, min_b]).max(0) + if not ((min_max > max_min).all()): + return 0.0 + + intersection = (min_max - max_min).prod() + vol_a = box_a[3:6].prod() + vol_b = box_b[3:6].prod() + union = vol_a + vol_b - intersection + return 1.0 * intersection / union + + +if __name__ == "__main__": + print("running some tests") + + ############ + ## Test IoU + ############ + box_a = np.array([0, 0, 0, 1, 1, 1]) + box_b = np.array([0, 0, 0, 2, 2, 2]) + expected_iou = 1.0 / 8 + pred_iou = calc_iou(box_a, box_b) + assert expected_iou == pred_iou, "function returned wrong IoU" + + box_a = np.array([0, 0, 0, 1, 1, 1]) + box_b = np.array([10, 10, 10, 2, 2, 2]) + expected_iou = 0.0 + pred_iou = calc_iou(box_a, box_b) + assert expected_iou == pred_iou, "function returned wrong IoU" + + print("IoU test -- PASSED") + + ######################### + ## Test Precition Recall + ######################### + gt_boxes = np.array([[0, 0, 0, 1, 1, 1], [3, 0, 1, 1, 10, 1]]) + detected_boxes = np.array( + [[0, 0, 0, 1, 1, 1, 1.0], [3, 0, 1, 1, 10, 1, 0.9]] + ) + TP, FP, FN = single_scene_precision_recall( + gt_boxes, detected_boxes, 0.5, 0.5 + ) + assert TP == 2 and FP == 0 and FN == 0 + assert precision_recall(TP, FP, FN) == (1, 1) + + detected_boxes = np.array([[0, 0, 0, 1, 1, 1, 1.0]]) + TP, FP, FN = single_scene_precision_recall( + gt_boxes, detected_boxes, 0.5, 0.5 + ) + assert TP == 1 and FP == 0 and FN == 1 + assert precision_recall(TP, FP, FN) == (1, 0.5) + + detected_boxes = np.array( + [[0, 0, 0, 1, 1, 1, 1.0], [-1, -1, 0, 0.1, 0.1, 1, 1.0]] + ) + TP, FP, FN = single_scene_precision_recall( + gt_boxes, detected_boxes, 0.5, 0.5 + ) + assert TP == 1 and FP == 1 and FN == 1 + assert precision_recall(TP, FP, 
FN) == (0.5, 0.5) + + # wrong box has low confidence + detected_boxes = np.array( + [[0, 0, 0, 1, 1, 1, 1.0], [-1, -1, 0, 0.1, 0.1, 1, 0.1]] + ) + TP, FP, FN = single_scene_precision_recall( + gt_boxes, detected_boxes, 0.5, 0.5 + ) + assert TP == 1 and FP == 0 and FN == 1 + assert precision_recall(TP, FP, FN) == (1, 0.5) + + print("Precition Recall test -- PASSED") diff --git a/models/Mask3D/mask3d/utils/votenet_utils/nms.py b/models/Mask3D/mask3d/utils/votenet_utils/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..9ad74f846ce16396190a6772546d7c0785c308a2 --- /dev/null +++ b/models/Mask3D/mask3d/utils/votenet_utils/nms.py @@ -0,0 +1,195 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +from pc_util import bbox_corner_dist_measure + +# boxes are axis aigned 2D boxes of shape (n,5) in FLOAT numbers with (x1,y1,x2,y2,score) +""" Ref: https://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/ +Ref: https://github.com/vickyboy47/nms-python/blob/master/nms.py +""" + + +def nms_2d(boxes, overlap_threshold): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + score = boxes[:, 4] + area = (x2 - x1) * (y2 - y1) + + I = np.argsort(score) + pick = [] + while I.size != 0: + last = I.size + i = I[-1] + pick.append(i) + suppress = [last - 1] + for pos in range(last - 1): + j = I[pos] + xx1 = max(x1[i], x1[j]) + yy1 = max(y1[i], y1[j]) + xx2 = min(x2[i], x2[j]) + yy2 = min(y2[i], y2[j]) + w = xx2 - xx1 + h = yy2 - yy1 + if w > 0 and h > 0: + o = w * h / area[j] + print("Overlap is", o) + if o > overlap_threshold: + suppress.append(pos) + I = np.delete(I, suppress) + return pick + + +def nms_2d_faster(boxes, overlap_threshold, old_type=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + score = boxes[:, 4] + area = (x2 - 
x1) * (y2 - y1) + + I = np.argsort(score) + pick = [] + while I.size != 0: + last = I.size + i = I[-1] + pick.append(i) + + xx1 = np.maximum(x1[i], x1[I[: last - 1]]) + yy1 = np.maximum(y1[i], y1[I[: last - 1]]) + xx2 = np.minimum(x2[i], x2[I[: last - 1]]) + yy2 = np.minimum(y2[i], y2[I[: last - 1]]) + + w = np.maximum(0, xx2 - xx1) + h = np.maximum(0, yy2 - yy1) + + if old_type: + o = (w * h) / area[I[: last - 1]] + else: + inter = w * h + o = inter / (area[i] + area[I[: last - 1]] - inter) + + I = np.delete( + I, np.concatenate(([last - 1], np.where(o > overlap_threshold)[0])) + ) + + return pick + + +def nms_3d_faster(boxes, overlap_threshold, old_type=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + z1 = boxes[:, 2] + x2 = boxes[:, 3] + y2 = boxes[:, 4] + z2 = boxes[:, 5] + score = boxes[:, 6] + area = (x2 - x1) * (y2 - y1) * (z2 - z1) + + I = np.argsort(score) + pick = [] + while I.size != 0: + last = I.size + i = I[-1] + pick.append(i) + + xx1 = np.maximum(x1[i], x1[I[: last - 1]]) + yy1 = np.maximum(y1[i], y1[I[: last - 1]]) + zz1 = np.maximum(z1[i], z1[I[: last - 1]]) + xx2 = np.minimum(x2[i], x2[I[: last - 1]]) + yy2 = np.minimum(y2[i], y2[I[: last - 1]]) + zz2 = np.minimum(z2[i], z2[I[: last - 1]]) + + l = np.maximum(0, xx2 - xx1) + w = np.maximum(0, yy2 - yy1) + h = np.maximum(0, zz2 - zz1) + + if old_type: + o = (l * w * h) / area[I[: last - 1]] + else: + inter = l * w * h + o = inter / (area[i] + area[I[: last - 1]] - inter) + + I = np.delete( + I, np.concatenate(([last - 1], np.where(o > overlap_threshold)[0])) + ) + + return pick + + +def nms_3d_faster_samecls(boxes, overlap_threshold, old_type=False): + x1 = boxes[:, 0] + y1 = boxes[:, 1] + z1 = boxes[:, 2] + x2 = boxes[:, 3] + y2 = boxes[:, 4] + z2 = boxes[:, 5] + score = boxes[:, 6] + cls = boxes[:, 7] + area = (x2 - x1) * (y2 - y1) * (z2 - z1) + + I = np.argsort(score) + pick = [] + while I.size != 0: + last = I.size + i = I[-1] + pick.append(i) + + xx1 = np.maximum(x1[i], x1[I[: last - 1]]) + yy1 
def nms_crnr_dist(boxes, conf, overlap_threshold):
    """Greedy non-maximum suppression using the corner-distance measure.

    Args:
        boxes: array indexed as ``boxes[k, :]``; each entry is handed to
            ``bbox_corner_dist_measure`` as one box.
        conf: per-box confidence, used only to order the candidates.
        overlap_threshold: candidates whose measure against the currently
            selected box is strictly greater than this value are dropped.

    Returns:
        List of kept box indices, highest confidence first.
    """
    order = np.argsort(conf)  # ascending, best candidate sits at the end
    keep = []
    while order.size != 0:
        top = order[-1]
        keep.append(top)

        # Similarity of the selected box against every remaining candidate.
        similarity = np.array(
            [
                bbox_corner_dist_measure(boxes[top, :], boxes[other, :])
                for other in order[:-1]
            ]
        )

        # Remove the selected box itself plus all candidates it suppresses.
        drop = np.concatenate(
            ([order.size - 1], np.where(similarity > overlap_threshold)[0])
        )
        order = np.delete(order, drop)

    return keep
def nn_distance(pc1, pc2, l1smooth=False, delta=1.0, l1=False):
    """Nearest-neighbour distances between two batched point sets.

    Input:
        pc1: (B,N,C) torch tensor
        pc2: (B,M,C) torch tensor
        l1smooth: bool, whether to use l1smooth (Huber) loss as the distance
        delta: scalar, the delta used in l1smooth loss
        l1: bool, whether to use the L1 distance; only consulted when
            l1smooth is False. With both flags False the squared L2
            distance is used.
    Output:
        dist1: (B,N) tensor, distance from each pc1 point to its nearest pc2 point
        idx1: (B,N) torch int64 tensor, index of that nearest pc2 point
        dist2: (B,M) tensor, distance from each pc2 point to its nearest pc1 point
        idx2: (B,M) torch int64 tensor, index of that nearest pc1 point
    """
    # Broadcasting (B,N,1,C) - (B,1,M,C) yields the (B,N,M,C) difference
    # directly, without first materialising two tiled copies of the inputs.
    pc_diff = pc1.unsqueeze(2) - pc2.unsqueeze(1)

    if l1smooth:
        pc_dist = torch.sum(huber_loss(pc_diff, delta), dim=-1)  # (B,N,M)
    elif l1:
        pc_dist = torch.sum(torch.abs(pc_diff), dim=-1)  # (B,N,M)
    else:
        pc_dist = torch.sum(pc_diff**2, dim=-1)  # (B,N,M)
    dist1, idx1 = torch.min(pc_dist, dim=2)  # (B,N)
    dist2, idx2 = torch.min(pc_dist, dim=1)  # (B,M)
    return dist1, idx1, dist2, idx2
pc2arr[0, j, :]) ** 2) + print(dist) + print("-" * 30) + print("L1smooth dists:") + dist1, idx1, dist2, idx2 = nn_distance(pc1, pc2, True) + print(dist1) + print(idx1) + dist = np.zeros((5, 6)) + for i in range(5): + for j in range(6): + error = np.abs(pc1arr[0, i, :] - pc2arr[0, j, :]) + quad = np.minimum(error, 1.0) + linear = error - quad + loss = 0.5 * quad**2 + 1.0 * linear + dist[i, j] = np.sum(loss) + print(dist) + + +if __name__ == "__main__": + demo_nn_distance() diff --git a/models/Mask3D/mask3d/utils/votenet_utils/pc_util.py b/models/Mask3D/mask3d/utils/votenet_utils/pc_util.py new file mode 100644 index 0000000000000000000000000000000000000000..765c064a3da328f7b30f7798512b811f8b9e75a7 --- /dev/null +++ b/models/Mask3D/mask3d/utils/votenet_utils/pc_util.py @@ -0,0 +1,607 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" Utility functions for processing point clouds. + +Author: Charles R. 
def random_sampling(pc, num_sample, replace=None, return_choices=False):
    """Randomly pick ``num_sample`` rows of ``pc``.

    Input is NxC, output is num_samplexC. When ``replace`` is None, sampling
    is done with replacement only if ``pc`` has fewer than ``num_sample``
    rows. With ``return_choices`` set, the chosen row indices are returned
    alongside the sampled rows.
    """
    total = pc.shape[0]
    with_replacement = (total < num_sample) if replace is None else replace
    picked = np.random.choice(total, num_sample, replace=with_replacement)
    sampled = pc[picked]
    return (sampled, picked) if return_choices else sampled
def volume_to_point_cloud(vol):
    """Convert a cubic occupancy grid into the indices of its occupied voxels.

    Args:
        vol: occupancy grid (value = 0 or 1) of size vsize*vsize*vsize.

    Returns:
        Kx3 numpy array holding the (a, b, c) index of every voxel equal
        to 1, ordered lexicographically by index. An empty (0, 3) array is
        returned when no voxel is occupied.

    Raises:
        AssertionError: if the three dimensions of ``vol`` are not equal.
    """
    vsize = vol.shape[0]
    # All three axes must match (previously shape[1] was checked twice and
    # shape[2] never, so non-cubic grids were silently truncated).
    assert vol.shape[1] == vsize and vol.shape[2] == vsize
    points = np.argwhere(vol == 1)
    if len(points) == 0:
        return np.zeros((0, 3))
    return points
def point_cloud_to_image_batch(
    point_clouds, imgsize, radius=1.0, num_sample=128
):
    """Apply ``point_cloud_to_image`` to every cloud of a batch.

    Input is BxNx3 a batch of point cloud
    Output is BxIxIxnum_samplex3 (per-cloud results joined along a new
    leading batch axis)
    """
    per_cloud = [
        np.expand_dims(
            point_cloud_to_image(
                point_clouds[idx, :, :], imgsize, radius, num_sample
            ),
            0,
        )
        for idx in range(point_clouds.shape[0])
    ]
    return np.concatenate(per_cloud, 0)
def read_ply(filename):
    """Read the XYZ point cloud stored in the PLY file ``filename``.

    Only the ``x``, ``y`` and ``z`` properties of the ``vertex`` element are
    used, so files that carry extra per-vertex properties (normals, colours,
    ...) are read correctly instead of failing on tuple unpacking.

    Returns:
        Nx3 numpy array of vertex coordinates.
    """
    plydata = PlyData.read(filename)
    vertices = plydata["vertex"].data
    # Select the coordinate fields by name rather than unpacking each record.
    pc_array = np.stack([vertices["x"], vertices["y"], vertices["z"]], axis=1)
    return pc_array
def rotate_point_cloud(points, rotation_matrix=None):
    """Rotate a point cloud about its own centroid.

    Input: (n,3), Output: (n,3)

    When ``rotation_matrix`` is None a rotation about the Z axis by a
    uniformly random angle is generated. The input array is not modified;
    the rotated copy and the matrix that was applied are both returned.
    """
    if rotation_matrix is None:
        theta = np.random.uniform() * 2 * np.pi
        s = np.sin(theta)
        c = np.cos(theta)
        rotation_matrix = np.array([[c, s, 0], [-s, c, 0], [0, 0, 1]])

    # Rotate around the centroid, then move the cloud back into place.
    centroid = points.mean(axis=0)
    centered = points - centroid
    return np.dot(centered, rotation_matrix) + centroid, rotation_matrix
def bbox_corner_dist_measure(crnr1, crnr2):
    """Compute a similarity between two boxes from their corner distances.

    Intended as a replacement for IoU.

    Args:
        crnr1, crnr2: Nx3 points of box corners in camera axis (y points
            down). Rows 0-7 are indexed, so each box needs 8 corners.

    Returns:
        ``max(1 - dist / u, 0)``, where ``dist`` is the smallest mean
        corner-to-corner distance over the four cyclic shifts of the corner
        order of ``crnr2`` and ``u`` is the mean, over both boxes, of the
        distance between corner 0 and corner 6. Identical boxes give 1.
    """
    dist = sys.maxsize
    for shift in range(4):
        # Cyclically shift corners 0-3 and, in lockstep, corners 4-7.
        rows = [(x + shift) % 4 for x in range(4)] + [
            4 + (x + shift) % 4 for x in range(4)
        ]
        d_ = np.linalg.norm(crnr2[rows, :] - crnr1, axis=1).sum() / 8.0
        if d_ < dist:
            dist = d_

    # Normalisation length: corner 0 to corner 6 distance, averaged over both boxes.
    u = sum([np.linalg.norm(x[0, :] - x[6, :]) for x in [crnr1, crnr2]]) / 2.0

    # No debug print here: this function is called once per box pair by the
    # NMS loop, so printing floods stdout.
    measure = max(1.0 - dist / u, 0)

    return measure
+ """ + + def convert_box_to_trimesh_fmt(box): + ctr = box[:3] + lengths = box[3:] + trns = np.eye(4) + trns[0:3, 3] = ctr + trns[3, 3] = 1.0 + box_trimesh_fmt = trimesh.creation.box(lengths, trns) + return box_trimesh_fmt + + scene = trimesh.scene.Scene() + for box in scene_bbox: + scene.add_geometry(convert_box_to_trimesh_fmt(box)) + + mesh_list = trimesh.util.concatenate(scene.dump()) + # save to ply file + trimesh.io.export.export_mesh(mesh_list, out_filename, file_type="ply") + + return + + +def write_oriented_bbox(scene_bbox, out_filename): + """Export oriented (around Z axis) scene bbox to meshes + Args: + scene_bbox: (N x 7 numpy array): xyz pos of center and 3 lengths (dx,dy,dz) + and heading angle around Z axis. + Y forward, X right, Z upward. heading angle of positive X is 0, + heading angle of positive Y is 90 degrees. + out_filename: (string) filename + """ + + def heading2rotmat(heading_angle): + pass + rotmat = np.zeros((3, 3)) + rotmat[2, 2] = 1 + cosval = np.cos(heading_angle) + sinval = np.sin(heading_angle) + rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]]) + return rotmat + + def convert_oriented_box_to_trimesh_fmt(box): + ctr = box[:3] + lengths = box[3:6] + trns = np.eye(4) + trns[0:3, 3] = ctr + trns[3, 3] = 1.0 + trns[0:3, 0:3] = heading2rotmat(box[6]) + box_trimesh_fmt = trimesh.creation.box(lengths, trns) + return box_trimesh_fmt + + scene = trimesh.scene.Scene() + for box in scene_bbox: + scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box)) + + mesh_list = trimesh.util.concatenate(scene.dump()) + # save to ply file + trimesh.io.export.export_mesh(mesh_list, out_filename, file_type="ply") + + return + + +def write_oriented_bbox_camera_coord(scene_bbox, out_filename): + """Export oriented (around Y axis) scene bbox to meshes + Args: + scene_bbox: (N x 7 numpy array): xyz pos of center and 3 lengths (dx,dy,dz) + and heading angle around Y axis. + Z forward, X rightward, Y downward. 
def write_lines_as_cylinders(pcl, filename, rad=0.005, res=64):
    """Create lines represented as cylinders connecting pairs of 3D points
    Args:
        pcl: (N x 2 x 3 numpy array): N pairs of xyz pos
        filename: (string) filename for the output mesh (ply) file; the
            ".ply" extension is appended to it
        rad: radius for the cylinder
        res: number of sections used to create the cylinder
    """
    scene = trimesh.scene.Scene()
    for src, tgt in pcl:
        # Transform that aligns the +Z axis with the segment direction.
        vec = tgt - src
        M = trimesh.geometry.align_vectors([0, 0, 1], vec, False)
        vec = (
            tgt - src
        )  # compute again since align_vectors modifies vec in-place!
        # Place the cylinder at the midpoint of the segment.
        M[:3, 3] = 0.5 * src + 0.5 * tgt
        height = np.sqrt(np.dot(vec, vec))
        scene.add_geometry(
            trimesh.creation.cylinder(
                radius=rad, height=height, sections=res, transform=M
            )
        )
    mesh_list = trimesh.util.concatenate(scene.dump())
    # Export through the mesh object: the `trimesh.io` module was renamed to
    # `trimesh.exchange` in trimesh 3.x.
    mesh_list.export("%s.ply" % (filename), file_type="ply")
class Logger(object):
    """Thin wrapper that writes scalar, image and histogram summaries
    through a ``tf.summary.FileWriter``."""

    def __init__(self, log_dir):
        """Create a summary writer logging to log_dir."""
        self.writer = tf.summary.FileWriter(log_dir)

    def scalar_summary(self, tag, value, step):
        """Log a scalar variable."""
        summary = tf.Summary(
            value=[tf.Summary.Value(tag=tag, simple_value=value)]
        )
        self.writer.add_summary(summary, step)

    def image_summary(self, tag, images, step):
        """Log a list of images.

        Each image is PNG-encoded and stored under the tag "<tag>/<index>".
        """
        # PNG data is binary, so a bytes buffer is always the right target.
        # Imported locally so this no longer depends on a bare `except:`
        # catching the NameError of a missing `StringIO` name.
        from io import BytesIO

        img_summaries = []
        for i, img in enumerate(images):
            # Write the image to an in-memory buffer
            s = BytesIO()
            # NOTE(review): scipy.misc.toimage is not available in recent
            # SciPy releases -- confirm the pinned SciPy still provides it.
            scipy.misc.toimage(img).save(s, format="png")

            # Create an Image object
            img_sum = tf.Summary.Image(
                encoded_image_string=s.getvalue(),
                height=img.shape[0],
                width=img.shape[1],
            )
            # Create a Summary value
            img_summaries.append(
                tf.Summary.Value(tag="%s/%d" % (tag, i), image=img_sum)
            )

        # Create and write Summary
        summary = tf.Summary(value=img_summaries)
        self.writer.add_summary(summary, step)

    def histo_summary(self, tag, values, step, bins=1000):
        """Log a histogram of the tensor of values."""

        # Create a histogram using numpy
        counts, bin_edges = np.histogram(values, bins=bins)

        # Fill the fields of the histogram proto
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        # Drop the start of the first bin
        bin_edges = bin_edges[1:]

        # Add bin edges and counts
        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        # Create and write Summary
        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.writer.add_summary(summary, step)
        self.writer.flush()
def print_current_scalars(self, epoch, i, scalars):
    """Print one line of scalar values and append it to the log file.

    ``scalars`` is a dictionary of scalar labels and values; each entry is
    rendered as "<label>: <value with 3 decimals> " after an
    "(epoch: .., iters: ..) " prefix. The same line is appended to the file
    at ``self.log_name``.
    """
    pieces = ["(epoch: %d, iters: %d) " % (epoch, i)]
    pieces.extend("%s: %.3f " % (label, val) for label, val in scalars.items())
    message = "".join(pieces)

    print(message)
    with open(self.log_name, "a") as log_file:
        log_file.write("%s\n" % message)
b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/_ext.cpython-310-x86_64-linux-gnu.so differ diff --git a/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_modules.py b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..2e82cdc249bd2a6cd8e87940a2103ce4438908d8 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_modules.py @@ -0,0 +1,581 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +""" Pointnet2 layers. +Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch +Extended with the following: +1. Uniform sampling in each local region (sample_uniformly) +2. Return sampled points indices to support votenet. +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +import os +import sys + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import pointnet2_utils +import pytorch_utils as pt_utils +from typing import List + + +class _PointnetSAModuleBase(nn.Module): + def __init__(self): + super().__init__() + self.npoint = None + self.groupers = None + self.mlps = None + + def forward( + self, xyz: torch.Tensor, features: torch.Tensor = None + ) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, N, C) tensor of the descriptors of the the features + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors + """ + + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + new_xyz = ( + pointnet2_utils.gather_operation( + xyz_flipped, + 
class PointnetSAModuleMSG(_PointnetSAModuleBase):
    r"""Pointnet set abstraction layer with multiscale grouping

    Parameters
    ----------
    npoint : int
        Number of features
    radii : list of float32
        list of radii to group with
    nsamples : list of int32
        Number of samples in each ball query
    mlps : list of list of int32
        Spec of the pointnet before the global max_pool for each scale.
        The lists are copied before use and are never modified.
    bn : bool
        Use batchnorm
    use_xyz : bool
        Passed to the groupers; when set, 3 is added to the first entry of
        each MLP spec.
    sample_uniformly : bool
        Passed to ``QueryAndGroup``.
    """

    def __init__(
        self,
        *,
        npoint: int,
        radii: List[float],
        nsamples: List[int],
        mlps: List[List[int]],
        bn: bool = True,
        use_xyz: bool = True,
        sample_uniformly: bool = False
    ):
        super().__init__()

        assert len(radii) == len(nsamples) == len(mlps)

        self.npoint = npoint
        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for radius, nsample, mlp in zip(radii, nsamples, mlps):
            # Without a sampling count every point goes into a single group.
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(
                    radius,
                    nsample,
                    use_xyz=use_xyz,
                    sample_uniformly=sample_uniformly,
                )
                if npoint is not None
                else pointnet2_utils.GroupAll(use_xyz)
            )
            # Work on a copy: incrementing the caller's list in place would
            # add 3 channels again each time the same spec is reused.
            mlp_spec = list(mlp)
            if use_xyz:
                mlp_spec[0] += 3

            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))
ball + nsample : int + Number of samples in the ball query + mlp : list + Spec of the pointnet before the global max_pool + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True + ): + super().__init__( + mlps=[mlp], + npoint=npoint, + radii=[radius], + nsamples=[nsample], + bn=bn, + use_xyz=use_xyz, + ) + + +class PointnetSAModuleVotes(nn.Module): + """Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes""" + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True, + pooling: str = "max", + sigma: float = None, # for RBF pooling + normalize_xyz: bool = False, # noramlize local XYZ with radius + sample_uniformly: bool = False, + ret_unique_cnt: bool = False + ): + super().__init__() + self.npoint = npoint + self.radius = radius + self.nsample = nsample + self.pooling = pooling + self.mlp_module = None + self.use_xyz = use_xyz + self.sigma = sigma + if self.sigma is None: + self.sigma = self.radius / 2 + self.normalize_xyz = normalize_xyz + self.ret_unique_cnt = ret_unique_cnt + + if npoint is not None: + self.grouper = pointnet2_utils.QueryAndGroup( + radius, + nsample, + use_xyz=use_xyz, + ret_grouped_xyz=True, + normalize_xyz=normalize_xyz, + sample_uniformly=sample_uniformly, + ret_unique_cnt=ret_unique_cnt, + ) + else: + self.grouper = pointnet2_utils.GroupAll( + use_xyz, ret_grouped_xyz=True + ) + + mlp_spec = mlp + if use_xyz and len(mlp_spec) > 0: + mlp_spec[0] += 3 + self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn) + + def forward( + self, + xyz: torch.Tensor, + features: torch.Tensor = None, + inds: torch.Tensor = None, + ) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of 
the xyz coordinates of the features + features : torch.Tensor + (B, C, N) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + else: + assert inds.shape[1] == self.npoint + new_xyz = ( + pointnet2_utils.gather_operation(xyz_flipped, inds) + .transpose(1, 2) + .contiguous() + if self.npoint is not None + else None + ) + + if not self.ret_unique_cnt: + grouped_features, grouped_xyz = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + else: + grouped_features, grouped_xyz, unique_cnt = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint) + + new_features = self.mlp_module( + grouped_features + ) # (B, mlp[-1], npoint, nsample) + if self.pooling == "max": + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == "avg": + new_features = F.avg_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == "rbf": + # Use radial basis function kernel for weighted sum of features (normalized by nsample and sigma) + # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel + rbf = torch.exp( + -1 + * grouped_xyz.pow(2).sum(1, keepdim=False) + / (self.sigma**2) + / 2 + ) # (B, npoint, nsample) + new_features = torch.sum( + new_features * rbf.unsqueeze(1), -1, keepdim=True + ) / float( + self.nsample + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) 
+ + if not self.ret_unique_cnt: + return new_xyz, new_features, inds + else: + return new_xyz, new_features, inds, unique_cnt + + +class PointnetSAModuleMSGVotes(nn.Module): + """Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes""" + + def __init__( + self, + *, + mlps: List[List[int]], + npoint: int, + radii: List[float], + nsamples: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert len(mlps) == len(nsamples) == len(radii) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup( + radius, + nsample, + use_xyz=use_xyz, + sample_uniformly=sample_uniformly, + ) + if npoint is not None + else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward( + self, + xyz: torch.Tensor, + features: torch.Tensor = None, + inds: torch.Tensor = None, + ) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, C) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + new_xyz = ( + pointnet2_utils.gather_operation(xyz_flipped, inds) 
+ .transpose(1, 2) + .contiguous() + if self.npoint is not None + else None + ) + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), inds + + +class PointnetFPModule(nn.Module): + r"""Propigates the features of one set to another + + Parameters + ---------- + mlp : list + Pointnet module parameters + bn : bool + Use batchnorm + """ + + def __init__(self, *, mlp: List[int], bn: bool = True): + super().__init__() + self.mlp = pt_utils.SharedMLP(mlp, bn=bn) + + def forward( + self, + unknown: torch.Tensor, + known: torch.Tensor, + unknow_feats: torch.Tensor, + known_feats: torch.Tensor, + ) -> torch.Tensor: + r""" + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of the xyz positions of the unknown features + known : torch.Tensor + (B, m, 3) tensor of the xyz positions of the known features + unknow_feats : torch.Tensor + (B, C1, n) tensor of the features to be propigated to + known_feats : torch.Tensor + (B, C2, m) tensor of features to be propigated + + Returns + ------- + new_features : torch.Tensor + (B, mlp[-1], n) tensor of the features of the unknown features + """ + + if known is not None: + dist, idx = pointnet2_utils.three_nn(unknown, known) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + + interpolated_feats = pointnet2_utils.three_interpolate( + known_feats, idx, weight + ) + else: + interpolated_feats = known_feats.expand( + *known_feats.size()[0:2], unknown.size(1) + ) + + if unknow_feats is not None: + new_features = torch.cat( + 
[interpolated_feats, unknow_feats], dim=1 + ) # (B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + new_features = self.mlp(new_features) + + return new_features.squeeze(-1) + + +class PointnetLFPModuleMSG(nn.Module): + """Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + learnable feature propagation layer.""" + + def __init__( + self, + *, + mlps: List[List[int]], + radii: List[float], + nsamples: List[int], + post_mlp: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert len(mlps) == len(nsamples) == len(radii) + + self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn) + + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup( + radius, + nsample, + use_xyz=use_xyz, + sample_uniformly=sample_uniformly, + ) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward( + self, + xyz2: torch.Tensor, + xyz1: torch.Tensor, + features2: torch.Tensor, + features1: torch.Tensor, + ) -> torch.Tensor: + r"""Propagate features from xyz1 to xyz2. 
+ Parameters + ---------- + xyz2 : torch.Tensor + (B, N2, 3) tensor of the xyz coordinates of the features + xyz1 : torch.Tensor + (B, N1, 3) tensor of the xyz coordinates of the features + features2 : torch.Tensor + (B, C2, N2) tensor of the descriptors of the the features + features1 : torch.Tensor + (B, C1, N1) tensor of the descriptors of the the features + + Returns + ------- + new_features1 : torch.Tensor + (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors + """ + new_features_list = [] + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz1, xyz2, features1 + ) # (B, C1, N2, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], N2, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], N2, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) + + if features2 is not None: + new_features = torch.cat( + [new_features, features2], dim=1 + ) # (B, mlp[-1] + C2, N2) + + new_features = new_features.unsqueeze(-1) + new_features = self.post_mlp(new_features) + + new_features_list.append(new_features) + + return torch.cat(new_features_list, dim=1).squeeze(-1) + + +if __name__ == "__main__": + from torch.autograd import Variable + + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) + xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) + + test_module = PointnetSAModuleMSG( + npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] + ) + test_module.cuda() + print(test_module(xyz, xyz_feats)) + + for _ in range(1): + _, new_features = test_module(xyz, xyz_feats) + new_features.backward( + torch.cuda.FloatTensor(*new_features.size()).fill_(1) + ) + print(new_features) + print(xyz.grad) diff --git a/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_test.py 
b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_test.py new file mode 100644 index 0000000000000000000000000000000000000000..0994ff56ccbb1e9c97316ba7125b164c34f0d66b --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_test.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +""" Testing customized ops. """ + +import torch +from torch.autograd import gradcheck +import numpy as np + +import os +import sys + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) +import pointnet2_utils + + +def test_interpolation_grad(): + batch_size = 1 + feat_dim = 2 + m = 4 + feats = ( + torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() + ) + + def interpolate_func(inputs): + idx = torch.from_numpy(np.array([[[0, 1, 2], [1, 2, 3]]])).int().cuda() + weight = ( + torch.from_numpy(np.array([[[1, 1, 1], [2, 2, 2]]])).float().cuda() + ) + interpolated_feats = pointnet2_utils.three_interpolate( + inputs, idx, weight + ) + return interpolated_feats + + assert gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1) + + +if __name__ == "__main__": + test_interpolation_grad() diff --git a/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_utils.py b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..24be3136b1c465b10599393b12a92bcb4ee96e8f --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pointnet2_utils.py @@ -0,0 +1,438 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +""" Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch """ +from __future__ import ( + division, + absolute_import, + with_statement, + print_function, + unicode_literals, +) +import torch +from torch.autograd import Function +import torch.nn as nn +import pointnet2.pytorch_utils as pt_utils +import sys + +try: + import builtins +except: + import __builtin__ as builtins + +try: + import pointnet2._ext as _ext +except ImportError: + if not getattr(builtins, "__POINTNET2_SETUP__", False): + raise ImportError( + "Could not import _ext module.\n" + "Please see the setup instructions in the README: " + "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" + ) + +if False: + # Workaround for type hints without depending on the `typing` module + from typing import * + + +class RandomDropout(nn.Module): + def __init__(self, p=0.5, inplace=False): + super(RandomDropout, self).__init__() + self.p = p + self.inplace = inplace + + def forward(self, X): + theta = torch.Tensor(1).uniform_(0, self.p)[0] + return pt_utils.feature_dropout_no_scaling( + X, theta, self.train, self.inplace + ) + + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, npoint): + # type: (Any, torch.Tensor, int) -> torch.Tensor + r""" + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor where N > npoint + npoint : int32 + number of features in the sampled set + + Returns + ------- + torch.Tensor + (B, npoint) tensor containing the set + """ + fps_inds = _ext.furthest_point_sampling(xyz, npoint) + ctx.mark_non_differentiable(fps_inds) + return fps_inds + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> 
torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor + + idx : torch.Tensor + (B, npoint) tensor of the features to gather + + Returns + ------- + torch.Tensor + (B, C, npoint) tensor + """ + + _, C, N = features.size() + + ctx.for_backwards = (idx, C, N) + + return _ext.gather_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + + grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class ThreeNN(Function): + @staticmethod + def forward(ctx, unknown, known): + # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + Find the three nearest neighbors of unknown in known + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of known features + known : torch.Tensor + (B, m, 3) tensor of unknown features + + Returns + ------- + dist : torch.Tensor + (B, n, 3) l2 distance to the three nearest neighbors + idx : torch.Tensor + (B, n, 3) index of 3 nearest neighbors + """ + dist2, idx = _ext.three_nn(unknown, known) + + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + @staticmethod + def forward(ctx, features, idx, weight): + # type(Any, torch.Tensor, torch.Tensor, torch.Tensor) -> Torch.Tensor + r""" + Performs weight linear interpolation on 3 features + Parameters + ---------- + features : torch.Tensor + (B, c, m) Features descriptors to be interpolated from + idx : torch.Tensor + (B, n, 3) three nearest neighbors of the target features in features + weight : torch.Tensor + (B, n, 3) weights + + Returns + ------- + torch.Tensor + (B, c, n) tensor of the interpolated features + """ + B, c, m = features.size() + n = idx.size(1) + + ctx.three_interpolate_for_backward = (idx, weight, m) + + return 
_ext.three_interpolate(features, idx, weight) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + r""" + Parameters + ---------- + grad_out : torch.Tensor + (B, c, n) tensor with gradients of ouputs + + Returns + ------- + grad_features : torch.Tensor + (B, c, m) tensor with gradients of features + + None + + None + """ + idx, weight, m = ctx.three_interpolate_for_backward + + grad_features = _ext.three_interpolate_grad( + grad_out.contiguous(), idx, weight, m + ) + + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor of features to group + idx : torch.Tensor + (B, npoint, nsample) tensor containing the indicies of features to group with + + Returns + ------- + torch.Tensor + (B, C, npoint, nsample) tensor + """ + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + + ctx.for_backwards = (idx, N) + + return _ext.group_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + + Parameters + ---------- + grad_out : torch.Tensor + (B, C, npoint, nsample) tensor of the gradients of the output from forward + + Returns + ------- + torch.Tensor + (B, C, N) gradient of the features + None + """ + idx, N = ctx.for_backwards + + grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N) + + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + @staticmethod + def forward(ctx, radius, nsample, xyz, new_xyz): + # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + radius : float + radius of the balls + nsample : int + 
maximum number of features in the balls + xyz : torch.Tensor + (B, N, 3) xyz coordinates of the features + new_xyz : torch.Tensor + (B, npoint, 3) centers of the ball query + + Returns + ------- + torch.Tensor + (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + inds = _ext.ball_query(new_xyz, xyz, radius, nsample) + ctx.mark_non_differentiable(inds) + return inds + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + r""" + Groups with a ball query of radius + + Parameters + --------- + radius : float32 + Radius of ball + nsample : int32 + Maximum number of features to gather in the ball + """ + + def __init__( + self, + radius, + nsample, + use_xyz=True, + ret_grouped_xyz=False, + normalize_xyz=False, + sample_uniformly=False, + ret_unique_cnt=False, + ): + super(QueryAndGroup, self).__init__() + self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + self.ret_grouped_xyz = ret_grouped_xyz + self.normalize_xyz = normalize_xyz + self.sample_uniformly = sample_uniformly + self.ret_unique_cnt = ret_unique_cnt + if self.ret_unique_cnt: + assert self.sample_uniformly + + def forward(self, xyz, new_xyz, features=None): + # type: (QueryAndGroup, torch.Tensor. 
torch.Tensor, torch.Tensor) -> Tuple[Torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + centriods (B, npoint, 3) + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, 3 + C, npoint, nsample) tensor + """ + idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + + if self.sample_uniformly: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint( + 0, + num_unique, + (self.nsample - num_unique,), + dtype=torch.long, + ) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation( + xyz_trans, idx + ) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + if self.normalize_xyz: + grouped_xyz /= self.radius + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert ( + self.use_xyz + ), "Cannot have not features and not use xyz as a feature!" 
+ new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + r""" + Groups all features + + Parameters + --------- + """ + + def __init__(self, use_xyz=True, ret_grouped_xyz=False): + # type: (GroupAll, bool) -> None + super(GroupAll, self).__init__() + self.use_xyz = use_xyz + + def forward(self, xyz, new_xyz, features=None): + # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + Ignored + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, C + 3, 1, N) tensor + """ + + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + if self.ret_grouped_xyz: + return new_features, grouped_xyz + else: + return new_features diff --git a/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pytorch_utils.py b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..59ece654dcda0cc3c0bb25c84f63bd06563dcfcd --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/lib.linux-x86_64-cpython-310/pointnet2/pytorch_utils.py @@ -0,0 +1,283 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +""" Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch """ +import torch +import torch.nn as nn +from typing import List, Tuple + + +class SharedMLP(nn.Sequential): + def __init__( + self, + args: List[int], + *, + bn: bool = False, + activation=nn.ReLU(inplace=True), + preact: bool = False, + first: bool = False, + name: str = "" + ): + super().__init__() + + for i in range(len(args) - 1): + self.add_module( + name + "layer{}".format(i), + Conv2d( + args[i], + args[i + 1], + bn=(not first or not preact or (i != 0)) and bn, + activation=activation + if (not first or not preact or (i != 0)) + else None, + preact=preact, + ), + ) + + +class _BNBase(nn.Sequential): + def __init__(self, in_size, batch_norm=None, name=""): + super().__init__() + self.add_module(name + "bn", batch_norm(in_size)) + + nn.init.constant_(self[0].weight, 1.0) + nn.init.constant_(self[0].bias, 0) + + +class BatchNorm1d(_BNBase): + def __init__(self, in_size: int, *, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) + + +class BatchNorm2d(_BNBase): + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name) + + +class BatchNorm3d(_BNBase): + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name) + + +class _ConvBase(nn.Sequential): + def __init__( + self, + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=None, + batch_norm=None, + bias=True, + preact=False, + name="", + ): + super().__init__() + + bias = bias and (not bn) + conv_unit = conv( + in_size, + out_size, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + init(conv_unit.weight) + if bias: + nn.init.constant_(conv_unit.bias, 0) + + if bn: + if not preact: + bn_unit = batch_norm(out_size) + else: + bn_unit = batch_norm(in_size) + + if preact: + if bn: + self.add_module(name + "bn", bn_unit) 
+ + if activation is not None: + self.add_module(name + "activation", activation) + + self.add_module(name + "conv", conv_unit) + + if not preact: + if bn: + self.add_module(name + "bn", bn_unit) + + if activation is not None: + self.add_module(name + "activation", activation) + + +class Conv1d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name, + ) + + +class Conv2d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name, + ) + + +class Conv3d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int, int] = (1, 1, 1), + stride: Tuple[int, int, int] = (1, 1, 1), + padding: Tuple[int, int, int] = (0, 0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv3d, + batch_norm=BatchNorm3d, + bias=bias, + preact=preact, + name=name, + ) + + +class FC(nn.Sequential): + def __init__( + self, + in_size: int, + 
out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant_(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + "bn", BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + "activation", activation) + + self.add_module(name + "fc", fc) + + if not preact: + if bn: + self.add_module(name + "bn", BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + "activation", activation) + + +def set_bn_momentum_default(bn_momentum): + def fn(m): + if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): + m.momentum = bn_momentum + + return fn + + +class BNMomentumScheduler(object): + def __init__( + self, model, bn_lambda, last_epoch=-1, setter=set_bn_momentum_default + ): + if not isinstance(model, nn.Module): + raise RuntimeError( + "Class '{}' is not a PyTorch nn Module".format( + type(model).__name__ + ) + ) + + self.model = model + self.setter = setter + self.lmbd = bn_lambda + + self.step(last_epoch + 1) + self.last_epoch = last_epoch + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + + self.last_epoch = epoch + self.model.apply(self.setter(self.lmbd(epoch))) diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps new file mode 100644 index 0000000000000000000000000000000000000000..5b475e17b7fcd7d14f3250049b8f720b99f9c651 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_deps differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_log b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_log new file mode 
100644 index 0000000000000000000000000000000000000000..3ed9825e1ed5af8b235173a266522663c23194b8 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/.ninja_log @@ -0,0 +1,10 @@ +# ninja log v5 +3 16339 1716815443290452011 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate_gpu.o 1110d6105579f2ec +2 16424 1716815443382452213 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points_gpu.o d52b03808d6848ab +1 17576 1716815444530454744 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query_gpu.o 4f1d3dbcad794901 +4 17716 1716815444666455043 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling_gpu.o 13eb66e506d5b99f +2 18032 1716815444986455748 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points.o 85685dcd2a08be4c +2 18178 1716815445154456117 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/bindings.o f599a3ced814a9dd +1 18190 1716815445162456135 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query.o 3be9456cf5d2266e +3 18398 1716815445374456601 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling.o 5d937f5708244c2e +3 18865 1716815445834457612 /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate.o 42bd13a6265f3342 diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/build.ninja 
b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/build.ninja new file mode 100644 index 0000000000000000000000000000000000000000..27569443f73a22d3f2c7881939f0dcea82262717 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/build.ninja @@ -0,0 +1,36 @@ +ninja_required_version = 1.3 +cxx = c++ +nvcc = /share/softwares/cuda_cudnn/cuda-11.1/bin/nvcc + +cflags = -pthread -B /share/softwares/anaconda/anaconda3/envs/openyolo3d/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /share/softwares/anaconda/anaconda3/envs/openyolo3d/include -fPIC -O2 -isystem /share/softwares/anaconda/anaconda3/envs/openyolo3d/include -fPIC -I/home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/TH -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/THC -I/share/softwares/cuda_cudnn/cuda-11.1/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/include/python3.10 -c +post_cflags = -O2 -Ipointnet2/_ext_src/include -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14 +cuda_cflags = -I/home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/torch/csrc/api/include 
-I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/TH -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/lib/python3.10/site-packages/torch/include/THC -I/share/softwares/cuda_cudnn/cuda-11.1/include -I/share/softwares/anaconda/anaconda3/envs/openyolo3d/include/python3.10 -c +cuda_post_cflags = -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -O2 -Ipointnet2/_ext_src/include -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1011"' -DTORCH_EXTENSION_NAME=_ext -D_GLIBCXX_USE_CXX11_ABI=0 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 -std=c++14 +ldflags = + +rule compile + command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags + depfile = $out.d + deps = gcc + +rule cuda_compile + depfile = $out.d + deps = gcc + command = $nvcc $cuda_cflags -c $in -o $out $cuda_post_cflags + + + +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query.o: compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query.cpp +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query_gpu.o: cuda_compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query_gpu.cu +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/bindings.o: compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/bindings.cpp +build 
/home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points.o: compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points.cpp +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points_gpu.o: cuda_compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points_gpu.cu +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate.o: compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate.cpp +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate_gpu.o: cuda_compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate_gpu.cu +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling.o: compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/sampling.cpp +build /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling_gpu.o: cuda_compile /home/jean/Amine/OpenYolo3D/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/sampling_gpu.cu + + + + + diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query.o new file mode 100644 index 0000000000000000000000000000000000000000..d7cf059dd2a10519a1a2dfa1606e17d419148da8 Binary files /dev/null and 
b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query_gpu.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query_gpu.o new file mode 100644 index 0000000000000000000000000000000000000000..47f49a83344f0df199aa93a25772b33380bcb391 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/ball_query_gpu.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/bindings.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/bindings.o new file mode 100644 index 0000000000000000000000000000000000000000..a18c246b7f759d4c07a684c31676cf0948c914d2 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/bindings.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points.o new file mode 100644 index 0000000000000000000000000000000000000000..71750c1433bc62049a1838265a12e50659fbe6ea Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points_gpu.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points_gpu.o new file mode 100644 index 0000000000000000000000000000000000000000..2d84a952fe152e6cd7309be9471ff0931e6c169c Binary files /dev/null and 
b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/group_points_gpu.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate.o new file mode 100644 index 0000000000000000000000000000000000000000..8f9705a09079a62f19eee309133381e53ba46a57 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate_gpu.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate_gpu.o new file mode 100644 index 0000000000000000000000000000000000000000..0d7e1c0499487c2543c01b992af187b2e4fe0440 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/interpolate_gpu.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling.o new file mode 100644 index 0000000000000000000000000000000000000000..342c77261194bc02ff57a9e45077a887a225aa89 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling.o differ diff --git a/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling_gpu.o b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling_gpu.o new file mode 100644 index 0000000000000000000000000000000000000000..56dd2e791c21a139b8794f5b88f022b64ae3a459 Binary files /dev/null and 
b/models/Mask3D/third_party/pointnet2/build/temp.linux-x86_64-cpython-310/pointnet2/_ext_src/src/sampling_gpu.o differ diff --git a/models/Mask3D/third_party/pointnet2/dist/pointnet2-0.0.0-py3.10-linux-x86_64.egg b/models/Mask3D/third_party/pointnet2/dist/pointnet2-0.0.0-py3.10-linux-x86_64.egg new file mode 100644 index 0000000000000000000000000000000000000000..ef040e5e6dddece40687ac9a0232871391ddf082 Binary files /dev/null and b/models/Mask3D/third_party/pointnet2/dist/pointnet2-0.0.0-py3.10-linux-x86_64.egg differ diff --git a/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/PKG-INFO b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..c44293899fb386f21d7a5c12d93d5cfbfb733e65 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/PKG-INFO @@ -0,0 +1,3 @@ +Metadata-Version: 2.1 +Name: pointnet2 +Version: 0.0.0 diff --git a/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/SOURCES.txt b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..da8c33f523afb3cf3c8c95eab3ac929831e01c75 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/SOURCES.txt @@ -0,0 +1,19 @@ +setup.py +pointnet2/__init__.py +pointnet2/pointnet2_modules.py +pointnet2/pointnet2_test.py +pointnet2/pointnet2_utils.py +pointnet2/pytorch_utils.py +pointnet2.egg-info/PKG-INFO +pointnet2.egg-info/SOURCES.txt +pointnet2.egg-info/dependency_links.txt +pointnet2.egg-info/top_level.txt +pointnet2/_ext_src/src/ball_query.cpp +pointnet2/_ext_src/src/ball_query_gpu.cu +pointnet2/_ext_src/src/bindings.cpp +pointnet2/_ext_src/src/group_points.cpp +pointnet2/_ext_src/src/group_points_gpu.cu +pointnet2/_ext_src/src/interpolate.cpp +pointnet2/_ext_src/src/interpolate_gpu.cu +pointnet2/_ext_src/src/sampling.cpp +pointnet2/_ext_src/src/sampling_gpu.cu \ No newline at end of file diff --git 
a/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/dependency_links.txt b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/top_level.txt b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..047cf37410a3a0fb88a8ba9fce0fa74cefa80d8f --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2.egg-info/top_level.txt @@ -0,0 +1 @@ +pointnet2 diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/__init__.py b/models/Mask3D/third_party/pointnet2/pointnet2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/ball_query.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/ball_query.h new file mode 100644 index 0000000000000000000000000000000000000000..b4feff83f2a307f13e13b4794ff942850c99127e --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/ball_query.h @@ -0,0 +1,7 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#pragma once +#include + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample); diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/cuda_utils.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/cuda_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f746526af880edc8de3563785db784fe205354bd --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/cuda_utils.h @@ -0,0 +1,43 @@ +// Copyright (c) Facebook, Inc. 
and its affiliates. + +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include +#include +#include + +#include +#include + +#include + +#define TOTAL_THREADS 512 + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = + max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + + return block_config; +} + +#define CUDA_CHECK_ERRORS() \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ + cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ + __FILE__); \ + exit(-1); \ + } \ + } while (0) + +#endif diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/group_points.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/group_points.h new file mode 100644 index 0000000000000000000000000000000000000000..97be802326d57cb9311bfde309bf39b215a1513e --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/group_points.h @@ -0,0 +1,8 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#pragma once +#include + +at::Tensor group_points(at::Tensor points, at::Tensor idx); +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/interpolate.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/interpolate.h new file mode 100644 index 0000000000000000000000000000000000000000..e7fb7923425a29aaaa7bef941463e4854ba2b991 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/interpolate.h @@ -0,0 +1,12 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ +#pragma once + +#include +#include + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows); +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight); +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m); diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/sampling.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..7de473e60ef260756547997986ec7f026a4a27f2 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/sampling.h @@ -0,0 +1,9 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#pragma once +#include + +at::Tensor gather_points(at::Tensor points, at::Tensor idx); +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples); diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/utils.h b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..815dabb20f6e1bead7e004551abfa48598802d35 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/include/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#pragma once +#include +#include + +#define CHECK_CUDA(x) \ + do { \ + AT_ASSERT(x.is_cuda(), #x " must be a CUDA tensor"); \ + } while (0) + +#define CHECK_CONTIGUOUS(x) \ + do { \ + AT_ASSERT(x.is_contiguous(), #x " must be a contiguous tensor"); \ + } while (0) + +#define CHECK_IS_INT(x) \ + do { \ + AT_ASSERT(x.scalar_type() == at::ScalarType::Int, \ + #x " must be an int tensor"); \ + } while (0) + +#define CHECK_IS_FLOAT(x) \ + do { \ + AT_ASSERT(x.scalar_type() == at::ScalarType::Float, \ + #x " must be a float tensor"); \ + } while (0) diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query.cpp b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7dd77d5f32332eb6db1535df6bf954cec17e6502 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query.cpp @@ -0,0 +1,35 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include "ball_query.h" +#include "utils.h" + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx); + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample) { + CHECK_CONTIGUOUS(new_xyz); + CHECK_CONTIGUOUS(xyz); + CHECK_IS_FLOAT(new_xyz); + CHECK_IS_FLOAT(xyz); + + if (new_xyz.is_cuda()) { + CHECK_CUDA(xyz); + } + + at::Tensor idx = + torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, + at::device(new_xyz.device()).dtype(at::ScalarType::Int)); + + if (new_xyz.is_cuda()) { + query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), + radius, nsample, new_xyz.data(), + xyz.data(), idx.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return idx; +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query_gpu.cu b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query_gpu.cu 
new file mode 100644 index 0000000000000000000000000000000000000000..cee88cb354999d7cb5a61e0d40216b8692c44265 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/ball_query_gpu.cu @@ -0,0 +1,57 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include +#include +#include + +#include "cuda_utils.h" + +// input: new_xyz(b, m, 3) xyz(b, n, 3) +// output: idx(b, m, nsample) +__global__ void query_ball_point_kernel(int b, int n, int m, float radius, + int nsample, + const float *__restrict__ new_xyz, + const float *__restrict__ xyz, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + xyz += batch_index * n * 3; + new_xyz += batch_index * m * 3; + idx += m * nsample * batch_index; + + int index = threadIdx.x; + int stride = blockDim.x; + + float radius2 = radius * radius; + for (int j = index; j < m; j += stride) { + float new_x = new_xyz[j * 3 + 0]; + float new_y = new_xyz[j * 3 + 1]; + float new_z = new_xyz[j * 3 + 2]; + for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { + float x = xyz[k * 3 + 0]; + float y = xyz[k * 3 + 1]; + float z = xyz[k * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < radius2) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[j * nsample + l] = k; + } + } + idx[j * nsample + cnt] = k; + ++cnt; + } + } + } +} + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + query_ball_point_kernel<<>>( + b, n, m, radius, nsample, new_xyz, xyz, idx); + + CUDA_CHECK_ERRORS(); +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/bindings.cpp b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/bindings.cpp new file mode 100644 index 0000000000000000000000000000000000000000..58d6c2d25a388ca49016dc8bedf7ac8fabe8fe0b --- /dev/null +++ 
b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/bindings.cpp @@ -0,0 +1,22 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include "ball_query.h" +#include "group_points.h" +#include "interpolate.h" +#include "sampling.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gather_points", &gather_points); + m.def("gather_points_grad", &gather_points_grad); + m.def("furthest_point_sampling", &furthest_point_sampling); + + m.def("three_nn", &three_nn); + m.def("three_interpolate", &three_interpolate); + m.def("three_interpolate_grad", &three_interpolate_grad); + + m.def("ball_query", &ball_query); + + m.def("group_points", &group_points); + m.def("group_points_grad", &group_points_grad); +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points.cpp b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22998dd7f40e678de6dd7ed32c8b411ad3a438e8 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points.cpp @@ -0,0 +1,63 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+ + +#include "group_points.h" +#include "utils.h" + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out); + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points); + +at::Tensor group_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), idx.size(2), points.data(), + idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} + +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.is_cuda()) { + group_points_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), + grad_out.data(), idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points_gpu.cu b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points_gpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..e36672e7476843f557035ab7e709e112b1b829da --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/group_points_gpu.cu @@ 
-0,0 +1,78 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + + +#include +#include + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, npoints, nsample) +// output: out(b, c, npoints, nsample) +__global__ void group_points_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * n * c; + idx += batch_index * npoints * nsample; + out += batch_index * npoints * nsample * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + out[(l * npoints + j) * nsample + k] = points[l * n + ii]; + } + } +} + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_kernel<<>>( + b, c, n, npoints, nsample, points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) +// output: grad_points(b, c, n) +__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * npoints * nsample * c; + idx += batch_index * npoints * nsample; + grad_points += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + atomicAdd(grad_points + l * n 
+ ii, + grad_out[(l * npoints + j) * nsample + k]); + } + } +} + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_grad_kernel<<>>( + b, c, n, npoints, nsample, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate.cpp b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b680c5202a1aea10dbceaf21e010c6a83c54932 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate.cpp @@ -0,0 +1,101 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#include "interpolate.h" +#include "utils.h" + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx); +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out); +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points); + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows) { + CHECK_CONTIGUOUS(unknowns); + CHECK_CONTIGUOUS(knows); + CHECK_IS_FLOAT(unknowns); + CHECK_IS_FLOAT(knows); + + if (unknowns.is_cuda()) { + CHECK_CUDA(knows); + } + + at::Tensor idx = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Int)); + at::Tensor dist2 = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Float)); + + if (unknowns.is_cuda()) { + three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), + unknowns.data(), knows.data(), + dist2.data(), idx.data()); + } else { + 
AT_ASSERT(false, "CPU not supported"); + } + + return {dist2, idx}; +} + +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + three_interpolate_kernel_wrapper( + points.size(0), points.size(1), points.size(2), idx.size(1), + points.data(), idx.data(), weight.data(), + output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (grad_out.is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), m}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.is_cuda()) { + three_interpolate_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), grad_out.size(2), m, + grad_out.data(), idx.data(), weight.data(), + output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate_gpu.cu b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate_gpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..b4c56440f316881695a9030a6bd508e43c7fc2b8 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/interpolate_gpu.cu @@ -0,0 +1,157 @@ +// Copyright (c) Facebook, Inc. 
and its affiliates. + + +#include +#include +#include + +#include "cuda_utils.h" + +// input: unknown(b, n, 3) known(b, m, 3) +// output: dist2(b, n, 3), idx(b, n, 3) +__global__ void three_nn_kernel(int b, int n, int m, + const float *__restrict__ unknown, + const float *__restrict__ known, + float *__restrict__ dist2, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + unknown += batch_index * n * 3; + known += batch_index * m * 3; + dist2 += batch_index * n * 3; + idx += batch_index * n * 3; + + int index = threadIdx.x; + int stride = blockDim.x; + for (int j = index; j < n; j += stride) { + float ux = unknown[j * 3 + 0]; + float uy = unknown[j * 3 + 1]; + float uz = unknown[j * 3 + 2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + float x = known[k * 3 + 0]; + float y = known[k * 3 + 1]; + float z = known[k * 3 + 2]; + float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[j * 3 + 0] = best1; + dist2[j * 3 + 1] = best2; + dist2[j * 3 + 2] = best3; + + idx[j * 3 + 0] = besti1; + idx[j * 3 + 1] = besti2; + idx[j * 3 + 2] = besti3; + } +} + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_nn_kernel<<>>(b, n, m, unknown, known, + dist2, idx); + + CUDA_CHECK_ERRORS(); +} + +// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) +// output: out(b, c, n) +__global__ void three_interpolate_kernel(int b, int c, int m, int n, + const float *__restrict__ points, + const int *__restrict__ idx, + const float *__restrict__ weight, + float *__restrict__ out) 
{ + int batch_index = blockIdx.x; + points += batch_index * m * c; + + idx += batch_index * n * 3; + weight += batch_index * n * 3; + + out += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + + points[l * m + i3] * w3; + } +} + +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_kernel<<>>( + b, c, m, n, points, idx, weight, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) +// output: grad_points(b, c, m) + +__global__ void three_interpolate_grad_kernel( + int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, const float *__restrict__ weight, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * n * c; + idx += batch_index * n * 3; + weight += batch_index * n * 3; + grad_points += batch_index * m * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); + atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); + atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); + } +} + +void 
three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_grad_kernel<<>>( + b, c, n, m, grad_out, idx, weight, grad_points); + + CUDA_CHECK_ERRORS(); +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/sampling.cpp b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/sampling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de55822a958c545f36e35264067f41c28f660286 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/_ext_src/src/sampling.cpp @@ -0,0 +1,88 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#include "sampling.h" +#include "utils.h" + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out); +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points); + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs); + +at::Tensor gather_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.is_cuda()) { + gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), points.data(), + idx.data(), output.data()); + } else { + AT_ASSERT(false, "CPU not supported"); + } + + return output; +} + +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, + const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if 
// Runs furthest point sampling on `points` through the CUDA wrapper and
// returns an Int tensor of shape {points.size(0), nsamples} holding the
// selected indices. Only the CUDA path exists: a non-CUDA `points` trips the
// AT_ASSERT below.
at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {
  CHECK_CONTIGUOUS(points);
  CHECK_IS_FLOAT(points);

  // Index buffer filled by the kernel wrapper; starts out as zeros.
  at::Tensor output =
      torch::zeros({points.size(0), nsamples},
                   at::device(points.device()).dtype(at::ScalarType::Int));

  // Scratch buffer with one float per (batch, point), initialised to a large
  // value (1e10) and handed to the kernel wrapper as its `temp` argument.
  at::Tensor tmp =
      torch::full({points.size(0), points.size(1)}, 1e10,
                  at::device(points.device()).dtype(at::ScalarType::Float));

  if (points.is_cuda()) {
    // NOTE(review): the data() accessors below carry no element type in this
    // copy of the file; verify against the upstream source.
    furthest_point_sampling_kernel_wrapper(
        points.size(0), points.size(1), nsamples, points.data(),
        tmp.data(), output.data());
  } else {
    AT_ASSERT(false, "CPU not supported");
  }

  return output;
}
// Copies, for every batch element and channel, the points selected by idx.
// input: points(b, c, n) idx(b, m)
// output: out(b, c, m)
__global__ void gather_points_kernel(int b, int c, int n, int m,
                                     const float *__restrict__ points,
                                     const int *__restrict__ idx,
                                     float *__restrict__ out) {
  // blockIdx.x walks the batch, blockIdx.y walks the channels and the threads
  // of a block walk the m output points. Each loop strides by the matching
  // grid/block extent, so the launch may be smaller than b, c or m.
  for (int i = blockIdx.x; i < b; i += gridDim.x) {
    for (int l = blockIdx.y; l < c; l += gridDim.y) {
      for (int j = threadIdx.x; j < m; j += blockDim.x) {
        // Source index for output point j; it is used as-is (no range check).
        int a = idx[i * m + j];
        out[(i * c + l) * m + j] = points[(i * c + l) * n + a];
      }
    }
  }
}
// Iterative furthest point sampling; one thread block handles one batch
// element (blockIdx.x).
// Input dataset: (b, n, 3), tmp: (b, n)
// Output idxs (b, m)
// NOTE(review): the template parameter list is missing in this copy of the
// file. The body uses `block_size` as a compile-time constant (shared array
// sizes, reduction guards) — restore it from the upstream source.
template
__global__ void furthest_point_sampling_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
  if (m <= 0) return;
  // Per-thread best candidate (distance, index), reduced across the block.
  __shared__ float dists[block_size];
  __shared__ int dists_i[block_size];

  // Move every pointer to the slice of this batch element.
  int batch_index = blockIdx.x;
  dataset += batch_index * n * 3;
  temp += batch_index * n;
  idxs += batch_index * m;

  int tid = threadIdx.x;
  const int stride = block_size;

  // The first sample is always point 0.
  int old = 0;
  if (threadIdx.x == 0) idxs[0] = old;

  __syncthreads();
  for (int j = 1; j < m; j++) {
    int besti = 0;
    float best = -1;
    // Coordinates of the most recently selected point.
    float x1 = dataset[old * 3 + 0];
    float y1 = dataset[old * 3 + 1];
    float z1 = dataset[old * 3 + 2];
    for (int k = tid; k < n; k += stride) {
      float x2, y2, z2;
      x2 = dataset[k * 3 + 0];
      y2 = dataset[k * 3 + 1];
      z2 = dataset[k * 3 + 2];
      // Points whose squared norm is at most 1e-3 are never considered.
      float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
      if (mag <= 1e-3) continue;

      float d =
          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);

      // temp[k] keeps the smallest squared distance from point k to any
      // sample chosen so far; the thread remembers its largest such value.
      float d2 = min(d, temp[k]);
      temp[k] = d2;
      besti = d2 > best ? k : besti;
      best = d2 > best ? d2 : best;
    }
    dists[tid] = best;
    dists_i[tid] = besti;
    __syncthreads();

    // Pairwise reduction in shared memory: each step folds the upper half of
    // the active range into the lower half through __update, with a barrier
    // after every step. Steps wider than block_size are skipped.
    if (block_size >= 512) {
      if (tid < 256) {
        __update(dists, dists_i, tid, tid + 256);
      }
      __syncthreads();
    }
    if (block_size >= 256) {
      if (tid < 128) {
        __update(dists, dists_i, tid, tid + 128);
      }
      __syncthreads();
    }
    if (block_size >= 128) {
      if (tid < 64) {
        __update(dists, dists_i, tid, tid + 64);
      }
      __syncthreads();
    }
    if (block_size >= 64) {
      if (tid < 32) {
        __update(dists, dists_i, tid, tid + 32);
      }
      __syncthreads();
    }
    if (block_size >= 32) {
      if (tid < 16) {
        __update(dists, dists_i, tid, tid + 16);
      }
      __syncthreads();
    }
    if (block_size >= 16) {
      if (tid < 8) {
        __update(dists, dists_i, tid, tid + 8);
      }
      __syncthreads();
    }
    if (block_size >= 8) {
      if (tid < 4) {
        __update(dists, dists_i, tid, tid + 4);
      }
      __syncthreads();
    }
    if (block_size >= 4) {
      if (tid < 2) {
        __update(dists, dists_i, tid, tid + 2);
      }
      __syncthreads();
    }
    if (block_size >= 2) {
      if (tid < 1) {
        __update(dists, dists_i, tid, tid + 1);
      }
      __syncthreads();
    }

    // Slot 0 now holds the winning index; it becomes the next sample and the
    // reference point of the next iteration.
    old = dists_i[0];
    if (tid == 0) idxs[j] = old;
  }
}
+ furthest_point_sampling_kernel<8> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_kernel<4> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_kernel<2> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_kernel<1> + <<>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + } + + CUDA_CHECK_ERRORS(); +} diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/pointnet2_modules.py b/models/Mask3D/third_party/pointnet2/pointnet2/pointnet2_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..2e82cdc249bd2a6cd8e87940a2103ce4438908d8 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/pointnet2_modules.py @@ -0,0 +1,581 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +""" Pointnet2 layers. +Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch +Extended with the following: +1. Uniform sampling in each local region (sample_uniformly) +2. Return sampled points indices to support votenet. 
class _PointnetSAModuleBase(nn.Module):
    """Common forward pass of the set-abstraction layers.

    Subclasses fill in ``npoint``, ``groupers`` and ``mlps``; this base class
    only initialises them to ``None``.
    """

    def __init__(self):
        super().__init__()
        self.npoint = None
        self.groupers = None
        self.mlps = None

    def forward(
        self, xyz: torch.Tensor, features: torch.Tensor = None
    ) -> (torch.Tensor, torch.Tensor):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor of the xyz coordinates of the features
        features : torch.Tensor
            descriptors of the features, passed unchanged to every grouper
            (presumably laid out as (B, C, N) -- confirm against the groupers)

        Returns
        -------
        new_xyz : torch.Tensor
            (B, npoint, 3) tensor of the sampled centres, or None when
            ``self.npoint`` is None
        new_features : torch.Tensor
            (B, \sum_k(mlps[k][-1]), npoint) tensor of the pooled descriptors
        """
        xyz_flipped = xyz.transpose(1, 2).contiguous()

        if self.npoint is None:
            new_xyz = None
        else:
            # Sample the centres by furthest point sampling and gather their
            # coordinates back into (B, npoint, 3) layout.
            sampled = pointnet2_utils.gather_operation(
                xyz_flipped,
                pointnet2_utils.furthest_point_sample(xyz, self.npoint),
            )
            new_xyz = sampled.transpose(1, 2).contiguous()

        per_scale = []
        for scale, grouper in enumerate(self.groupers):
            grouped = grouper(xyz, new_xyz, features)  # (B, C, npoint, nsample)
            encoded = self.mlps[scale](grouped)  # (B, mlp[-1], npoint, nsample)
            # Max over the nsample axis, then drop the singleton it leaves.
            pooled = F.max_pool2d(
                encoded, kernel_size=[1, encoded.size(3)]
            )  # (B, mlp[-1], npoint, 1)
            per_scale.append(pooled.squeeze(-1))  # (B, mlp[-1], npoint)

        return new_xyz, torch.cat(per_scale, dim=1)
class PointnetSAModuleVotes(nn.Module):
    """Set-abstraction layer that also returns the sampled point indices.

    Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG
    with extra support for returning point indices for getting their GT votes.

    Parameters
    ----------
    mlp : list of int
        Spec of the shared MLP applied to the grouped features. The list is
        copied, so the caller's list is never modified.
    npoint : int or None
        Number of centres to sample. ``None`` groups all points instead.
    radius : float or None
        Ball-query radius (only used when ``npoint`` is given).
    nsample : int or None
        Number of samples per ball query.
    bn : bool
        Use batchnorm in the shared MLP.
    use_xyz : bool
        Concatenate the local xyz offsets to the features (adds 3 input
        channels to the MLP).
    pooling : str
        "max", "avg" or "rbf".
    sigma : float or None
        Bandwidth of the RBF pooling; defaults to ``radius / 2`` when a radius
        is available.
    normalize_xyz : bool
        Normalize local XYZ with radius.
    sample_uniformly : bool
        Forwarded to the grouper.
    ret_unique_cnt : bool
        Also return the per-region unique counts produced by the grouper.
    """

    def __init__(
        self,
        *,
        mlp: List[int],
        npoint: int = None,
        radius: float = None,
        nsample: int = None,
        bn: bool = True,
        use_xyz: bool = True,
        pooling: str = "max",
        sigma: float = None,  # for RBF pooling
        normalize_xyz: bool = False,  # normalize local XYZ with radius
        sample_uniformly: bool = False,
        ret_unique_cnt: bool = False
    ):
        super().__init__()
        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample
        self.pooling = pooling
        self.mlp_module = None
        self.use_xyz = use_xyz
        self.sigma = sigma
        # Only derive sigma from the radius when there is one; group-all mode
        # (npoint=None) is commonly built without a radius.
        if self.sigma is None and self.radius is not None:
            self.sigma = self.radius / 2
        self.normalize_xyz = normalize_xyz
        self.ret_unique_cnt = ret_unique_cnt

        if npoint is not None:
            self.grouper = pointnet2_utils.QueryAndGroup(
                radius,
                nsample,
                use_xyz=use_xyz,
                ret_grouped_xyz=True,
                normalize_xyz=normalize_xyz,
                sample_uniformly=sample_uniformly,
                ret_unique_cnt=ret_unique_cnt,
            )
        else:
            self.grouper = pointnet2_utils.GroupAll(
                use_xyz, ret_grouped_xyz=True
            )

        # Work on a copy: bumping the first width in place would corrupt the
        # caller's list (and double-count when a spec is reused).
        mlp_spec = list(mlp)
        if use_xyz and len(mlp_spec) > 0:
            mlp_spec[0] += 3
        self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn)

    def forward(
        self,
        xyz: torch.Tensor,
        features: torch.Tensor = None,
        inds: torch.Tensor = None,
    ) -> (torch.Tensor, torch.Tensor):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor of the xyz coordinates of the features
        features : torch.Tensor
            (B, C, N) tensor of the descriptors of the features
        inds : torch.Tensor
            (B, npoint) tensor that stores index to the xyz points
            (values in 0..N-1); sampled by furthest point sampling when None

        Returns
        -------
        new_xyz : torch.Tensor
            (B, npoint, 3) tensor of the new features' xyz, or None when
            ``self.npoint`` is None
        new_features : torch.Tensor
            (B, mlp[-1], npoint) tensor of the new_features descriptors
        inds : torch.Tensor
            (B, npoint) tensor of the inds (the argument is returned
            unchanged when ``self.npoint`` is None)
        unique_cnt : torch.Tensor
            only returned when ``ret_unique_cnt`` is set
        """
        if self.npoint is not None:
            if inds is None:
                inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint)
            else:
                assert inds.shape[1] == self.npoint
            xyz_flipped = xyz.transpose(1, 2).contiguous()
            new_xyz = (
                pointnet2_utils.gather_operation(xyz_flipped, inds)
                .transpose(1, 2)
                .contiguous()
            )
        else:
            # Group-all mode: there is nothing to sample or gather.
            new_xyz = None

        if not self.ret_unique_cnt:
            grouped_features, grouped_xyz = self.grouper(
                xyz, new_xyz, features
            )  # (B, C, npoint, nsample)
        else:
            grouped_features, grouped_xyz, unique_cnt = self.grouper(
                xyz, new_xyz, features
            )  # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint)

        new_features = self.mlp_module(
            grouped_features
        )  # (B, mlp[-1], npoint, nsample)
        # NOTE(review): any other pooling value skips pooling altogether and
        # leaves the nsample axis in place; kept for backward compatibility.
        if self.pooling == "max":
            new_features = F.max_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], npoint, 1)
        elif self.pooling == "avg":
            new_features = F.avg_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], npoint, 1)
        elif self.pooling == "rbf":
            # Use radial basis function kernel for weighted sum of features
            # (normalized by nsample and sigma)
            # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel
            rbf = torch.exp(
                -1
                * grouped_xyz.pow(2).sum(1, keepdim=False)
                / (self.sigma**2)
                / 2
            )  # (B, npoint, nsample)
            new_features = torch.sum(
                new_features * rbf.unsqueeze(1), -1, keepdim=True
            ) / float(
                self.nsample
            )  # (B, mlp[-1], npoint, 1)
        new_features = new_features.squeeze(-1)  # (B, mlp[-1], npoint)

        if not self.ret_unique_cnt:
            return new_xyz, new_features, inds
        else:
            return new_xyz, new_features, inds, unique_cnt
class PointnetFPModule(nn.Module):
    r"""Propagates the features of one point set to another

    Parameters
    ----------
    mlp : list
        Pointnet module parameters
    bn : bool
        Use batchnorm
    """

    def __init__(self, *, mlp: List[int], bn: bool = True):
        super().__init__()
        self.mlp = pt_utils.SharedMLP(mlp, bn=bn)

    def forward(
        self,
        unknown: torch.Tensor,
        known: torch.Tensor,
        unknow_feats: torch.Tensor,
        known_feats: torch.Tensor,
    ) -> torch.Tensor:
        r"""
        Parameters
        ----------
        unknown : torch.Tensor
            (B, n, 3) tensor of the xyz positions of the unknown features
        known : torch.Tensor
            (B, m, 3) tensor of the xyz positions of the known features;
            when None, known_feats is broadcast to all n positions instead
        unknow_feats : torch.Tensor
            (B, C1, n) tensor of the features to be propagated to (may be None)
        known_feats : torch.Tensor
            (B, C2, m) tensor of features to be propagated

        Returns
        -------
        new_features : torch.Tensor
            (B, mlp[-1], n) tensor of the features of the unknown features
        """
        if known is None:
            # No source positions: repeat the known features along the last
            # axis so there is one column per unknown point.
            interpolated = known_feats.expand(
                *known_feats.size()[0:2], unknown.size(1)
            )
        else:
            # Inverse-distance weights over the three nearest known points,
            # normalised to sum to one per unknown point.
            dist, idx = pointnet2_utils.three_nn(unknown, known)
            inv_dist = 1.0 / (dist + 1e-8)
            weight = inv_dist / torch.sum(inv_dist, dim=2, keepdim=True)
            interpolated = pointnet2_utils.three_interpolate(
                known_feats, idx, weight
            )

        if unknow_feats is None:
            stacked = interpolated
        else:
            stacked = torch.cat(
                [interpolated, unknow_feats], dim=1
            )  # (B, C2 + C1, n)

        # The shared MLP runs on 4-D input, so add and then drop a trailing axis.
        return self.mlp(stacked.unsqueeze(-1)).squeeze(-1)
radius, + nsample, + use_xyz=use_xyz, + sample_uniformly=sample_uniformly, + ) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward( + self, + xyz2: torch.Tensor, + xyz1: torch.Tensor, + features2: torch.Tensor, + features1: torch.Tensor, + ) -> torch.Tensor: + r"""Propagate features from xyz1 to xyz2. + Parameters + ---------- + xyz2 : torch.Tensor + (B, N2, 3) tensor of the xyz coordinates of the features + xyz1 : torch.Tensor + (B, N1, 3) tensor of the xyz coordinates of the features + features2 : torch.Tensor + (B, C2, N2) tensor of the descriptors of the the features + features1 : torch.Tensor + (B, C1, N1) tensor of the descriptors of the the features + + Returns + ------- + new_features1 : torch.Tensor + (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors + """ + new_features_list = [] + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz1, xyz2, features1 + ) # (B, C1, N2, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], N2, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], N2, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) + + if features2 is not None: + new_features = torch.cat( + [new_features, features2], dim=1 + ) # (B, mlp[-1] + C2, N2) + + new_features = new_features.unsqueeze(-1) + new_features = self.post_mlp(new_features) + + new_features_list.append(new_features) + + return torch.cat(new_features_list, dim=1).squeeze(-1) + + +if __name__ == "__main__": + from torch.autograd import Variable + + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) + xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) + + test_module = PointnetSAModuleMSG( + npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] + ) + test_module.cuda() + 
def test_interpolation_grad():
    """Check the gradient of ``three_interpolate`` numerically on a tiny CUDA input."""
    n_batch, n_channels, n_known = 1, 2, 4
    feats = (
        torch.randn(n_batch, n_channels, n_known, requires_grad=True)
        .float()
        .cuda()
    )

    def interpolate_func(inputs):
        # Two target points, each tied to three of the four source points
        # with fixed (un-normalised) weights.
        neighbor_idx = np.array([[[0, 1, 2], [1, 2, 3]]])
        neighbor_weight = np.array([[[1, 1, 1], [2, 2, 2]]])
        idx = torch.from_numpy(neighbor_idx).int().cuda()
        weight = torch.from_numpy(neighbor_weight).float().cuda()
        return pointnet2_utils.three_interpolate(inputs, idx, weight)

    assert gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1)
class RandomDropout(nn.Module):
    """Feature dropout whose drop rate is re-drawn on every forward pass.

    Parameters
    ----------
    p : float
        Upper bound of the drop rate; the rate used is drawn uniformly
        from [0, p).
    inplace : bool
        Forwarded to ``pt_utils.feature_dropout_no_scaling``.
    """

    def __init__(self, p=0.5, inplace=False):
        super(RandomDropout, self).__init__()
        self.p = p
        self.inplace = inplace

    def forward(self, X):
        # Draw this call's drop rate.
        theta = torch.Tensor(1).uniform_(0, self.p)[0]
        # Pass the boolean mode flag ``self.training``. ``self.train`` is the
        # bound nn.Module.train method and is always truthy, so the previous
        # code could never signal evaluation mode.
        return pt_utils.feature_dropout_no_scaling(
            X, theta, self.training, self.inplace
        )
class ThreeNN(Function):
    @staticmethod
    def forward(ctx, unknown, known):
        # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
        r"""
        Find the three nearest neighbors of unknown in known
        Parameters
        ----------
        unknown : torch.Tensor
            (B, n, 3) tensor of the query (unknown) positions
        known : torch.Tensor
            (B, m, 3) tensor of the reference (known) positions

        Returns
        -------
        dist : torch.Tensor
            (B, n, 3) l2 distance to the three nearest neighbors
        idx : torch.Tensor
            (B, n, 3) index of 3 nearest neighbors
        """
        # The extension hands back squared distances; take the root here.
        sq_dist, neighbor_idx = _ext.three_nn(unknown, known)
        dist = torch.sqrt(sq_dist)
        return dist, neighbor_idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        # No gradient is propagated to either input.
        return None, None
class GroupingOperation(Function):
    @staticmethod
    def forward(ctx, features, idx):
        # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""
        Gather the features addressed by idx into per-region groups.

        Parameters
        ----------
        features : torch.Tensor
            (B, C, N) tensor of features to group
        idx : torch.Tensor
            (B, npoint, nsample) tensor containing the indices of features to group with

        Returns
        -------
        torch.Tensor
            (B, C, npoint, nsample) tensor
        """
        # Unpacking doubles as a rank check: both inputs must be 3-D.
        _batch, _npoint, _nsample = idx.size()
        _b, _channels, num_points = features.size()

        # backward needs the indices and the original point count.
        ctx.for_backwards = (idx, num_points)

        return _ext.group_points(features, idx)

    @staticmethod
    def backward(ctx, grad_out):
        # type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor]
        r"""
        Route the output gradients back to the grouped features.

        Parameters
        ----------
        grad_out : torch.Tensor
            (B, C, npoint, nsample) tensor of the gradients of the output from forward

        Returns
        -------
        torch.Tensor
            (B, C, N) gradient of the features
        None
        """
        saved_idx, num_points = ctx.for_backwards
        grad = _ext.group_points_grad(
            grad_out.contiguous(), saved_idx, num_points
        )
        return grad, None
maximum number of features in the balls + xyz : torch.Tensor + (B, N, 3) xyz coordinates of the features + new_xyz : torch.Tensor + (B, npoint, 3) centers of the ball query + + Returns + ------- + torch.Tensor + (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + inds = _ext.ball_query(new_xyz, xyz, radius, nsample) + ctx.mark_non_differentiable(inds) + return inds + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + r""" + Groups with a ball query of radius + + Parameters + --------- + radius : float32 + Radius of ball + nsample : int32 + Maximum number of features to gather in the ball + """ + + def __init__( + self, + radius, + nsample, + use_xyz=True, + ret_grouped_xyz=False, + normalize_xyz=False, + sample_uniformly=False, + ret_unique_cnt=False, + ): + super(QueryAndGroup, self).__init__() + self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + self.ret_grouped_xyz = ret_grouped_xyz + self.normalize_xyz = normalize_xyz + self.sample_uniformly = sample_uniformly + self.ret_unique_cnt = ret_unique_cnt + if self.ret_unique_cnt: + assert self.sample_uniformly + + def forward(self, xyz, new_xyz, features=None): + # type: (QueryAndGroup, torch.Tensor. 
torch.Tensor, torch.Tensor) -> Tuple[Torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + centriods (B, npoint, 3) + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, 3 + C, npoint, nsample) tensor + """ + idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + + if self.sample_uniformly: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint( + 0, + num_unique, + (self.nsample - num_unique,), + dtype=torch.long, + ) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation( + xyz_trans, idx + ) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + if self.normalize_xyz: + grouped_xyz /= self.radius + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert ( + self.use_xyz + ), "Cannot have not features and not use xyz as a feature!" 
+ new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + r""" + Groups all features + + Parameters + --------- + """ + + def __init__(self, use_xyz=True, ret_grouped_xyz=False): + # type: (GroupAll, bool) -> None + super(GroupAll, self).__init__() + self.use_xyz = use_xyz + + def forward(self, xyz, new_xyz, features=None): + # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + Ignored + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, C + 3, 1, N) tensor + """ + + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + if self.ret_grouped_xyz: + return new_features, grouped_xyz + else: + return new_features diff --git a/models/Mask3D/third_party/pointnet2/pointnet2/pytorch_utils.py b/models/Mask3D/third_party/pointnet2/pointnet2/pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..59ece654dcda0cc3c0bb25c84f63bd06563dcfcd --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/pointnet2/pytorch_utils.py @@ -0,0 +1,283 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+ +""" Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch """ +import torch +import torch.nn as nn +from typing import List, Tuple + + +class SharedMLP(nn.Sequential): + def __init__( + self, + args: List[int], + *, + bn: bool = False, + activation=nn.ReLU(inplace=True), + preact: bool = False, + first: bool = False, + name: str = "" + ): + super().__init__() + + for i in range(len(args) - 1): + self.add_module( + name + "layer{}".format(i), + Conv2d( + args[i], + args[i + 1], + bn=(not first or not preact or (i != 0)) and bn, + activation=activation + if (not first or not preact or (i != 0)) + else None, + preact=preact, + ), + ) + + +class _BNBase(nn.Sequential): + def __init__(self, in_size, batch_norm=None, name=""): + super().__init__() + self.add_module(name + "bn", batch_norm(in_size)) + + nn.init.constant_(self[0].weight, 1.0) + nn.init.constant_(self[0].bias, 0) + + +class BatchNorm1d(_BNBase): + def __init__(self, in_size: int, *, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) + + +class BatchNorm2d(_BNBase): + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name) + + +class BatchNorm3d(_BNBase): + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name) + + +class _ConvBase(nn.Sequential): + def __init__( + self, + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=None, + batch_norm=None, + bias=True, + preact=False, + name="", + ): + super().__init__() + + bias = bias and (not bn) + conv_unit = conv( + in_size, + out_size, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + init(conv_unit.weight) + if bias: + nn.init.constant_(conv_unit.bias, 0) + + if bn: + if not preact: + bn_unit = batch_norm(out_size) + else: + bn_unit = batch_norm(in_size) + + if preact: + if bn: + self.add_module(name + "bn", bn_unit) 
+ + if activation is not None: + self.add_module(name + "activation", activation) + + self.add_module(name + "conv", conv_unit) + + if not preact: + if bn: + self.add_module(name + "bn", bn_unit) + + if activation is not None: + self.add_module(name + "activation", activation) + + +class Conv1d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name, + ) + + +class Conv2d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name, + ) + + +class Conv3d(_ConvBase): + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int, int] = (1, 1, 1), + stride: Tuple[int, int, int] = (1, 1, 1), + padding: Tuple[int, int, int] = (0, 0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv3d, + batch_norm=BatchNorm3d, + bias=bias, + preact=preact, + name=name, + ) + + +class FC(nn.Sequential): + def __init__( + self, + in_size: int, + 
out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant_(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + "bn", BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + "activation", activation) + + self.add_module(name + "fc", fc) + + if not preact: + if bn: + self.add_module(name + "bn", BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + "activation", activation) + + +def set_bn_momentum_default(bn_momentum): + def fn(m): + if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): + m.momentum = bn_momentum + + return fn + + +class BNMomentumScheduler(object): + def __init__( + self, model, bn_lambda, last_epoch=-1, setter=set_bn_momentum_default + ): + if not isinstance(model, nn.Module): + raise RuntimeError( + "Class '{}' is not a PyTorch nn Module".format( + type(model).__name__ + ) + ) + + self.model = model + self.setter = setter + self.lmbd = bn_lambda + + self.step(last_epoch + 1) + self.last_epoch = last_epoch + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + + self.last_epoch = epoch + self.model.apply(self.setter(self.lmbd(epoch))) diff --git a/models/Mask3D/third_party/pointnet2/setup.py b/models/Mask3D/third_party/pointnet2/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0a7e1a9723a71548a6e0a0dcc349fa698ee142 --- /dev/null +++ b/models/Mask3D/third_party/pointnet2/setup.py @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from setuptools import find_packages, setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import glob +import os.path as osp + +this_dir = osp.dirname(osp.abspath(__file__)) + +_ext_src_root = "pointnet2/_ext_src" +_ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( + "{}/src/*.cu".format(_ext_src_root) +) +_ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) + +setup( + name="pointnet2", + ext_modules=[ + CUDAExtension( + name="pointnet2._ext", + sources=_ext_sources, + extra_compile_args={ + "cxx": [ + "-O2", + "-I{}".format("{}/include".format(_ext_src_root)), + ], + "nvcc": [ + "-O2", + "-I{}".format("{}/include".format(_ext_src_root)), + ], + }, + include_dirs=[osp.join(this_dir, _ext_src_root, "include")], + ) + ], + cmdclass={"build_ext": BuildExtension}, + packages=find_packages(), +) diff --git a/models/YOLO-World/.dockerignore b/models/YOLO-World/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..1aefdd171f6113a7e143e357e7fc0a804d8121ae --- /dev/null +++ b/models/YOLO-World/.dockerignore @@ -0,0 +1,2 @@ +docs +Dockerfile \ No newline at end of file diff --git a/models/YOLO-World/.gitattributes b/models/YOLO-World/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a86bbf76e750c2c73f109d8eecd23298d0a5a6dd --- /dev/null +++ b/models/YOLO-World/.gitattributes @@ -0,0 +1,34 @@ +# Basic .gitattributes for a python repo. 
+ +# Source files +# ============ +*.pxd text diff=python +*.py text diff=python +*.py3 text diff=python +*.pyw text diff=python +*.pyx text diff=python +*.pyz text diff=python +*.pyi text diff=python + +# Binary files +# ============ +*.db binary +*.p binary +*.pkl binary +*.pickle binary +*.pyc binary export-ignore +*.pyo binary export-ignore +*.pyd binary + +# Jupyter notebook +*.ipynb text eol=lf + +# Others +* text=auto +*.txt text +*.sh text eol=lf + +# Note: .db, .p, and .pkl files are associated +# with the python modules ``pickle``, ``dbm.*``, +# ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` +# (among others). diff --git a/models/YOLO-World/.gitmodules b/models/YOLO-World/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..4d97e1db083248de331235e176a80e5d5fc4b85b --- /dev/null +++ b/models/YOLO-World/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/mmyolo"] + path = third_party/mmyolo + url = https://github.com/onuralpszr/mmyolo.git diff --git a/models/YOLO-World/Dockerfile b/models/YOLO-World/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..4869c792851577c052bee84aa2ead917cef3d96a --- /dev/null +++ b/models/YOLO-World/Dockerfile @@ -0,0 +1,43 @@ +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 + +ARG MODEL="yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" +ARG WEIGHT="yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth" + +ENV FORCE_CUDA="1" +ENV MMCV_WITH_OPS=1 + +# curl is required by the weight download step further down and is not +# guaranteed to be present in the base image, so install it explicitly. +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3-pip \ + libgl1-mesa-glx \ + libsm6 \ + libxext6 \ + libxrender-dev \ + libglib2.0-0 \ + git \ + python3-dev \ + python3-wheel \ + curl + +RUN pip3 install --upgrade pip \ + && pip3 install \ + gradio \ + opencv-python \ + supervision \ + mmengine \ + setuptools \ + openmim \ + && mim install mmcv==2.0.0 \ + && pip3 install --no-cache-dir --index-url 
https://download.pytorch.org/whl/cu118 \ + wheel \ + torch \ + torchvision \ + torchaudio + +COPY . /yolo +WORKDIR /yolo + +RUN pip3 install -e . + +RUN curl -o weights/$WEIGHT -L https://huggingface.co/wondervictor/YOLO-World/resolve/main/$WEIGHT + +# ARG values exist only at build time and exec-form arrays are not shell-expanded, +# so persist the defaults as ENV and resolve them in a small sh wrapper at run time. +# Arguments passed to `docker run` still replace the default config/weight pair. +ENV MODEL=$MODEL WEIGHT=$WEIGHT +ENTRYPOINT [ "/bin/sh", "-c", "if [ \"$#\" -eq 0 ]; then set -- \"configs/pretrain/$MODEL\" \"weights/$WEIGHT\"; fi; exec python3 demo.py \"$@\"", "demo" ] \ No newline at end of file diff --git a/models/YOLO-World/LICENSE b/models/YOLO-World/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..dd8b209f0bc925e778312b84191a75023c38eb87 --- /dev/null +++ b/models/YOLO-World/LICENSE @@ -0,0 +1,1347 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. 
+ + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. 
+States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. 
However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. 
+ + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
\ No newline at end of file diff --git a/models/YOLO-World/README.md b/models/YOLO-World/README.md new file mode 100644 index 0000000000000000000000000000000000000000..90f85f319e78f24bbdc01e7a3328f95c6bcf37aa --- /dev/null +++ b/models/YOLO-World/README.md @@ -0,0 +1,231 @@ +
+ +
+Tianheng Cheng2,3,*, +Lin Song1,📧,*, +Yixiao Ge1,🌟,2, + Wenyu Liu3, +Xinggang Wang3,📧, +Ying Shan1,2 +
+ +\* Equal contribution 🌟 Project lead 📧 Corresponding author + +1 Tencent AI Lab, 2 ARC Lab, Tencent PCG +3 Huazhong University of Science and Technology +
+
+ +[![project page](https://img.shields.io/badge/Project-Page-green)](https://wondervictor.github.io/) +[![arxiv paper](https://img.shields.io/badge/arXiv-Paper-red)](https://arxiv.org/abs/2401.17270) +Open In Colab +[![demo](https://img.shields.io/badge/🤗HuggingFace-Spaces-orange)](https://huggingface.co/spaces/stevengrove/YOLO-World) +[![Replicate](https://replicate.com/zsxkib/yolo-world/badge)](https://replicate.com/zsxkib/yolo-world) +[![hfpaper](https://img.shields.io/badge/🤗HuggingFace-Paper-yellow)](https://huggingface.co/papers/2401.17270) +[![license](https://img.shields.io/badge/License-GPLv3.0-blue)](LICENSE) +[![yoloworldseg](https://img.shields.io/badge/YOLOWorldxEfficientSAM-🤗Spaces-orange)](https://huggingface.co/spaces/SkalskiP/YOLO-World) +[![yologuide](https://img.shields.io/badge/📖Notebook-roboflow-purple)](https://supervision.roboflow.com/develop/notebooks/zero-shot-object-detection-with-yolo-world) +[![deploy](https://media.roboflow.com/deploy.svg)](https://inference.roboflow.com/foundation/yolo_world/) + +
+
+ +## Notice + +We recommend that everyone **use English to communicate on issues**, as this helps developers from around the world discuss, share experiences, and answer questions together. + +## 🔥 Updates +`[2024-5-18]:` YOLO-World models have been [integrated with the FiftyOne computer vision toolkit](https://docs.voxel51.com/integrations/ultralytics.html#open-vocabulary-detection) for streamlined open-vocabulary inference across image and video datasets. +`[2024-5-16]:` Hey guys! Long time no see! This update contains (1) [fine-tuning guide](https://github.com/AILab-CVC/YOLO-World?#highlights--introduction) and (2) [TFLite Export](./docs/tflite_deploy.md) with INT8 Quantization. +`[2024-5-9]:` This update contains the real [`reparameterization`](./docs/reparameterize.md) 🪄, and it's better for fine-tuning on custom datasets and improves the training/inference efficiency 🚀! +`[2024-4-28]:` Long time no see! This update contains bug fixes and improvements: (1) ONNX demo; (2) image demo (supports tensor input); (3) new pre-trained models; (4) image prompts; (5) simple version for fine-tuning / deployment; (6) guide for installation (includes a `requirements.txt`). +`[2024-3-28]:` We provide: (1) more high-resolution pre-trained models (e.g., S, M, X) ([#142](https://github.com/AILab-CVC/YOLO-World/issues/142)); (2) pre-trained models with CLIP-Large text encoders. Most importantly, we preliminarily fix the **fine-tuning without `mask-refine`** and explore a new fine-tuning setting ([#160](https://github.com/AILab-CVC/YOLO-World/issues/160),[#76](https://github.com/AILab-CVC/YOLO-World/issues/76)). In addition, fine-tuning YOLO-World with `mask-refine` also obtains significant improvements, check more details in [configs/finetune_coco](./configs/finetune_coco/). 
+`[2024-3-16]:` We fix the bugs about the demo ([#110](https://github.com/AILab-CVC/YOLO-World/issues/110),[#94](https://github.com/AILab-CVC/YOLO-World/issues/94),[#129](https://github.com/AILab-CVC/YOLO-World/issues/129), [#125](https://github.com/AILab-CVC/YOLO-World/issues/125)) with visualizations of segmentation masks, and release [**YOLO-World with Embeddings**](./docs/prompt_yolo_world.md), which supports prompt tuning, text prompts and image prompts. +`[2024-3-3]:` We add the **high-resolution YOLO-World**, which supports `1280x1280` resolution with higher accuracy and better performance for small objects! +`[2024-2-29]:` We release the newest version of [ **YOLO-World-v2**](./docs/updates.md) with higher accuracy and faster speed! We hope the community can join us to improve YOLO-World! +`[2024-2-28]:` Excited to announce that YOLO-World has been accepted by **CVPR 2024**! We're continuing to make YOLO-World faster and stronger, as well as making it better to use for all. +`[2024-2-22]:` We sincerely thank [RoboFlow](https://roboflow.com/) and [@Skalskip92](https://twitter.com/skalskip92) for the [**Video Guide**](https://www.youtube.com/watch?v=X7gKBGVz4vs) about YOLO-World, nice work! +`[2024-2-18]:` We thank [@Skalskip92](https://twitter.com/skalskip92) for developing the wonderful segmentation demo via connecting YOLO-World and EfficientSAM. You can try it now at the [🤗 HuggingFace Spaces](https://huggingface.co/spaces/SkalskiP/YOLO-World). +`[2024-2-17]:` The largest model **X** of YOLO-World is released, which achieves better zero-shot performance! +`[2024-2-17]:` We release the code & models for **YOLO-World-Seg** now! YOLO-World now supports open-vocabulary / zero-shot object segmentation! +`[2024-2-15]:` The pre-traind YOLO-World-L with CC3M-Lite is released! +`[2024-2-14]:` We provide the [`image_demo`](demo.py) for inference on images or directories. 
+`[2024-2-10]:` We provide the [fine-tuning](./docs/finetuning.md) and [data](./docs/data.md) details for fine-tuning YOLO-World on the COCO dataset or the custom datasets! +`[2024-2-3]:` We support the `Gradio` demo now in the repo and you can build the YOLO-World demo on your own device! +`[2024-2-1]:` We've released the code and weights of YOLO-World now! +`[2024-2-1]:` We deploy the YOLO-World demo on [HuggingFace 🤗](https://huggingface.co/spaces/stevengrove/YOLO-World), you can try it now! +`[2024-1-31]:` We are excited to launch **YOLO-World**, a cutting-edge real-time open-vocabulary object detector. + + +## TODO + +YOLO-World is under active development and please stay tuned ☕️! +If you have suggestions📃 or ideas💡,**we would love for you to bring them up in the [Roadmap](https://github.com/AILab-CVC/YOLO-World/issues/109)** ❤️! +> YOLO-World 目前正在积极开发中📃,如果你有建议或者想法💡,**我们非常希望您在 [Roadmap](https://github.com/AILab-CVC/YOLO-World/issues/109) 中提出来** ❤️! + +## [FAQ (Frequently Asked Questions)](https://github.com/AILab-CVC/YOLO-World/discussions/149) + +We have set up an FAQ about YOLO-World in the discussion on GitHub. We hope everyone can raise issues or solutions during use here, and we also hope that everyone can quickly find solutions from it. + +> 我们在GitHub的discussion中建立了关于YOLO-World的常见问答,这里将收集一些常见问题,同时大家可以在此提出使用中的问题或者解决方案,也希望大家能够从中快速寻找到解决方案 + + +## Highlights & Introduction + +This repo contains the PyTorch implementation, pre-trained weights, and pre-training/fine-tuning code for YOLO-World. + +* YOLO-World is pre-trained on large-scale datasets, including detection, grounding, and image-text datasets. + +* YOLO-World is the next-generation YOLO detector, with a strong open-vocabulary detection capability and grounding ability. + +* YOLO-World presents a *prompt-then-detect* paradigm for efficient user-vocabulary inference, which re-parameterizes vocabulary embeddings as parameters into the model and achieve superior inference speed. 
You can try to export your own detection model without extra training or fine-tuning in our [online demo](https://huggingface.co/spaces/stevengrove/YOLO-World)! + + +
+ +
+## Model Zoo + +We've pre-trained YOLO-World-S/M/L from scratch and evaluate on the `LVIS val-1.0` and `LVIS minival`. We provide the pre-trained model weights and training logs for applications/research or re-producing the results. + +### Zero-shot Inference on LVIS dataset + +
+ +| model | Pre-train Data | Size | APmini | APr | APc | APf | APval | APr | APc | APf | weights | +| :------------------------------------------------------------------------------------------------------------------- | :------------------- | :----------------- | :--------------: | :------------: | :------------: | :------------: | :-------------: | :------------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 22.7 | 16.3 | 20.8 | 25.5 | 17.3 | 11.3 | 14.9 | 22.7 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea.pth)| +| [YOLO-Worldv2-S](./configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 24.1 | 18.7 | 22.0 | 26.9 | 18.8 | 14.1 | 16.3 | 23.8 |[HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_obj365v1_goldg_pretrain_1280ft-fc4ff4f7.pth)| +| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 30.0 | 25.0 | 27.2 | 33.4 | 23.5 | 17.1 | 20.0 | 30.1 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain-c6237d5b.pth)| +| [YOLO-Worldv2-M](./configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 31.6 | 24.5 | 29.0 | 35.1 | 25.3 | 19.3 | 22.0 | 31.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_obj365v1_goldg_pretrain_1280ft-77d0346d.pth)| +| 
[YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 33.0 | 22.6 | 32.0 | 35.8 | 26.0 | 18.6 | 23.0 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth)| +| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py) | O365+GoldG | 1280🔸 | 34.6 | 29.2 | 32.8 | 37.2 | 27.6 | 21.9 | 24.2 | 34.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth)| +| [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) 🔥 | O365+GoldG | 640 | 34.0 | 22.0 | 32.6 | 37.4 | 27.1 | 19.9 | 23.9 | 33.9 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain-8ff2e744.pth)| +| [YOLO-Worldv2-L (CLIP-Large)](./configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py) 🔥 | O365+GoldG | 800🔸 | 35.5 | 28.3 | 33.2 | 38.8 | 28.6 | 22.0 | 25.1 | 35.4 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_clip_large_o365v1_goldg_pretrain_800ft-9df82e55.pth)| +| [YOLO-Worldv2-L](./configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 32.9 | 25.3 | 31.1 | 35.8 | 26.1 | 20.6 | 22.6 | 32.3 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth)| +| [YOLO-Worldv2-X](./configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 35.4 | 28.7 | 32.9 | 38.7 | 28.4 | 20.6 | 25.6 | 35.0 | [HF Checkpoints 
🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain-8698fbfa.pth) | +| 🔥 [YOLO-Worldv2-X]() | O365+GoldG+CC3M-Lite | 1280🔸 | 37.4 | 30.5 | 35.2 | 40.7 | 29.8 | 21.1 | 26.8 | 37.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth) | +| [YOLO-Worldv2-XL](./configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 36.0 | 25.8 | 34.1 | 39.5 | 29.1 | 21.1 | 26.3 | 35.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_xl_obj365v1_goldg_cc3mlite_pretrain-5daf1395.pth) | + + +
+ +**NOTE:** +1. APmini: evaluated on LVIS `minival`. +3. APval: evaluated on LVIS `val 1.0`. +4. [HuggingFace Mirror](https://hf-mirror.com/) provides the mirror of HuggingFace, which is a choice for users who are unable to reach. +5. 🔸: fine-tuning models with the pre-trained data. + +**Pre-training Logs:** + +We provide the pre-training logs of `YOLO-World-v2`. Due to the unexpected errors of the local machines, the training might be interrupted several times. + +| Model | YOLO-World-v2-S | YOLO-World-v2-M | YOLO-World-v2-L | YOLO-World-v2-X | +| :--- | :-------------: | :--------------: | :-------------: | :-------------: | +|Pre-training Log | [Part-1](https://drive.google.com/file/d/1oib7pKfA2h1U_5-85H_s0Nz8jWd0R-WP/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/11cZ6OZy80VTvBlZy3kzLAHCxx5Iix5-n/view?usp=drive_link) | [Part-1](https://drive.google.com/file/d/1E6vYSS8kBipGc8oQnsjAfeUAx8I9yOX7/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/1fbM7vt2tgSeB8o_7tUDofWvpPNSViNj5/view?usp=drive_link) | [Part-1](https://drive.google.com/file/d/1Tola1QGJZTL6nGy3SBxKuknfNfREDm8J/view?usp=drive_link), [Part-2](https://drive.google.com/file/d/1mTBXniioUb0CdctCG4ckIU6idGo0NnH8/view?usp=drive_link) | [Final part](https://drive.google.com/file/d/1aEUA_EPQbXOrpxHTQYB6ieGXudb1PLpd/view?usp=drive_link)| + + +## Getting started + +### 1. Installation + +YOLO-World is developed based on `torch==1.11.0` `mmyolo==0.6.0` and `mmdetection==3.0.0`. Check more details about `requirements` and `mmcv` in [docs/installation](./docs/installation.md). + +#### Clone Project + +```bash +git clone --recursive https://github.com/AILab-CVC/YOLO-World.git +``` +#### Install + +```bash +pip install torch wheel -q +pip install -e . +``` + +### 2. Preparing Data + +We provide the details about the pre-training data in [docs/data](./docs/data.md). 
+ + +## Training & Evaluation + +We adopt the default [training](./tools/train.py) or [evaluation](./tools/test.py) scripts of [mmyolo](https://github.com/open-mmlab/mmyolo). +We provide the configs for pre-training and fine-tuning in `configs/pretrain` and `configs/finetune_coco`. +Training YOLO-World is easy: + +```bash +chmod +x tools/dist_train.sh +# sample command for pre-training, use AMP for mixed-precision training +./tools/dist_train.sh configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py 8 --amp +``` +**NOTE:** YOLO-World is pre-trained on 4 nodes with 8 GPUs per node (32 GPUs in total). For pre-training, the `node_rank` and `nnodes` for multi-node training should be specified. + +Evaluating YOLO-World is also easy: + +```bash +chmod +x tools/dist_test.sh +./tools/dist_test.sh path/to/config path/to/weights 8 +``` + +**NOTE:** We mainly evaluate the performance on LVIS-minival for pre-training. + +## Fine-tuning YOLO-World + +
+ +
+ + +
+

Choose your pre-trained YOLO-World and Fine-tune it!

+
+ + +YOLO-World supports **zero-shot inference**, and three types of **fine-tuning recipes**: **(1) normal fine-tuning**, **(2) prompt tuning**, and **(3) reparameterized fine-tuning**. + +* Normal Fine-tuning: we provide the details about fine-tuning YOLO-World in [docs/fine-tuning](./docs/finetuning.md). + +* Prompt Tuning: we provide more details about prompt tuning in [docs/prompt_yolo_world](./docs/prompt_yolo_world.md). + +* Reparameterized Fine-tuning: the reparameterized YOLO-World is more suitable for specific domains far from generic scenes. You can find more details in [docs/reparameterize](./docs/reparameterize.md). + +## Deployment + +We provide the details about deployment for downstream applications in [docs/deployment](./docs/deploy.md). +You can directly download the ONNX model through the online [demo](https://huggingface.co/spaces/stevengrove/YOLO-World) in Huggingface Spaces 🤗. + +- [x] ONNX export and demo: [docs/deploy](https://github.com/AILab-CVC/YOLO-World/blob/master/docs/deploy.md) +- [x] TFLite and INT8 Quantization: [docs/tflite_deploy](https://github.com/AILab-CVC/YOLO-World/blob/master/docs/tflite_deploy.md) +- [ ] TensorRT: coming soon. +- [ ] C++: coming soon. + +## Demo + +See [`demo`](./demo) for more details. + +- [x] `gradio_demo.py`: Gradio demo, ONNX export +- [x] `image_demo.py`: inference with images or a directory of images +- [x] `simple_demo.py`: a simple demo of YOLO-World, using `array` (instead of path as input). +- [x] `video_demo.py`: run YOLO-World inference on videos. +- [x] `inference.ipynb`: jupyter notebook for YOLO-World. +- [x] [Google Colab Notebook](https://colab.research.google.com/drive/1F_7S5lSaFM06irBCZqjhbN7MpUXo6WwO?usp=sharing): We sincerely thank [Onuralp](https://github.com/onuralpszr) for sharing the [Colab Demo](https://colab.research.google.com/drive/1F_7S5lSaFM06irBCZqjhbN7MpUXo6WwO?usp=sharing), you can have a try 😊! 
+ +## Acknowledgement + +We sincerely thank [mmyolo](https://github.com/open-mmlab/mmyolo), [mmdetection](https://github.com/open-mmlab/mmdetection), [GLIP](https://github.com/microsoft/GLIP), and [transformers](https://github.com/huggingface/transformers) for providing their wonderful code to the community! + +## Citations +If you find YOLO-World is useful in your research or applications, please consider giving us a star 🌟 and citing it. + +```bibtex +@inproceedings{Cheng2024YOLOWorld, + title={YOLO-World: Real-Time Open-Vocabulary Object Detection}, + author={Cheng, Tianheng and Song, Lin and Ge, Yixiao and Liu, Wenyu and Wang, Xinggang and Shan, Ying}, + booktitle={Proc. IEEE Conf. Computer Vision and Pattern Recognition (CVPR)}, + year={2024} +} +``` + +## Licence +YOLO-World is under the GPL-v3 Licence and is supported for commercial usage. If you need a commercial license for YOLO-World, please feel free to contact us. diff --git a/models/YOLO-World/configs/finetune_coco/README.md b/models/YOLO-World/configs/finetune_coco/README.md new file mode 100644 index 0000000000000000000000000000000000000000..954d64a5f593f0f984f6ddcf5c6cd96168b7179e --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/README.md @@ -0,0 +1,29 @@ +## Fine-tune YOLO-World on MS-COCO + + +### Updates + +1. [2024-3-27]: Considering that fine-tuning YOLO-World on COCO **without `mask-refine`** obtains bad results, e.g., YOLO-World-L obtains 48.6 AP without `mask-refine` compared to 53.3 AP with `mask-refine`, we rethink the training process and explore new training schemes for fine-tuning without `mask-refine`. +BTW, the COCO fine-tuning results are updated with higher performance (with `mask-refine`)! + + +### COCO Results and Checkpoints + +**NOTE:** +1. APZS: AP evaluated in the zero-shot setting (w/o fine-tuning on COCO dataset). +2. `mask-refine`: refine the box annotations with masks, and add `CopyPaste` augmentation during training. 
+ +| model | Schedule | `mask-refine` | efficient neck | APZS| AP | AP50 | AP75 | weights | log | +| :---- | :-------: | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: | +| [YOLO-World-v2-S](./yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 46.1 | 62.0 | 49.9 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-492dc329.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) | +| [YOLO-World-v2-M](./yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 42.8 | 51.0 | 67.5 | 55.2 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-69c27ac7.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240327_110411.log) | +| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 45.1 | 53.9 | 70.9 | 58.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-81c701ee.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240326_160313.log) | +| [YOLO-World-v2-X](./yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 46.8 | 54.7 | 71.6 | 59.6 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_ep80-76bc0cbd.pth) | 
[log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco_20240322_181232.log) | +| [YOLO-World-v2-L](./yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py) 🔥 | SGD, 1e-3, 40e | ✖️ | ✖️ | 45.1 | 52.8 | 69.5 | 57.8 | [HF Checkpoints](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth) | [log](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetuning_coco_20240327_014902.log) | + + +### Reparameterized Training + +| model | Schedule | `mask-refine` | efficient neck | APZS| AP | AP50 | AP75 | weights | log | +| :---- | :-------: | :----------: |:-------------: | :------------: | :-: | :--------------:| :-------------: |:------: | :-: | +| [YOLO-World-v2-S](./yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) | AdamW, 2e-4, 80e | ✔️ | ✖️ | 37.5 | 46.3 | 62.8 | 50.4 | [HF Checkpoints]() | [log]() | \ No newline at end of file diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..101a571dbf6a6c79d50c37dff98a2ac0698e91b7 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py @@ -0,0 +1,179 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, 
_base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from='pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), 
+ border_val=(114, 114, 114)) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + 
type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2ddbe50d4c63d7cd5953f9f096b57661ccb2f287 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,181 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 
+load_from='pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + 
min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + 
strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b5cdca5069b2a76915b80d64e02fc0a44840899e --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,159 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 
'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' +# huggingface text model +text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='EfficientCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + 
use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *text_transform +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +coco_train_dataset = dict(_delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, + min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + 
switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..275311837444f5f56f823825b98f474d211c7519 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_efficient_neck_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,182 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +text_model_name 
= '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='EfficientCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + 
*mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + 
switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d01affe6182e271da2057bbd32b15f43200fd5c6 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,181 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' 
+text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + 
type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] 
+train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3978469fccb003aa02e6a39da38d1e6007706253 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco.py @@ -0,0 +1,160 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 40 # Maximum training epochs +close_mosaic_epochs = 30 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 1e-3 +weight_decay = 0.0005 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# 
model settings +model = dict(type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict(type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *text_transform +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] + 
+coco_train_dataset = dict(_delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, + min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='SGD', + lr=base_lr, + momentum=0.937, + nesterov=True, + weight_decay=weight_decay, + 
batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..23608ba56c47df3bf19d5ddcaa12cf70d4f15ee6 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,161 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 1e-3 +weight_decay = 0.0005 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict(type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + 
backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict(type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *text_transform +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +coco_train_dataset = dict(_delete_=True, + type='MultiModalDataset', + 
dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, + min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='SGD', + lr=base_lr, + momentum=0.937, + nesterov=True, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + 
custom_keys={ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..32fcc51cdffc459a3d11461174a989e6e3438688 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_m_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,182 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth' +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + 
image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + 
data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': 
dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..498011019471f55cf525802c44bbc865f9a67655 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,145 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = '../FastDet/output_models/pretrain_yolow-v8_s_clipv2_frozen_te_noprompt_t2i_bn_2e-3adamw_scale_lr_wd_32xb16-100e_obj365v1_goldg_cc3mram250k_train_lviseval-e3592307_rep_conv.pth' +persistent_workers = False +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# model settings +model = dict(type='SimpleYOLOWorldDetector', + mm_neck=True, + num_train_classes=num_classes, + num_test_classes=num_classes, + reparameterized=True, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + with_text_model=False), + 
neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='EfficientCSPLayerWithTwoConv')), + bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', + embed_dims=text_channels, + num_guide=num_classes, + num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +# dataset settings +final_transform = [ + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +mosaic_affine_transform = [ + dict(type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *final_transform +] + +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform] + +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +train_dataloader = dict(persistent_workers=persistent_workers, + 
batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..9fe682c87edc1c1c7c8e6d10f2c08e5f819b501f --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,146 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = '../FastDet/output_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea_rep_conv.pth' +persistent_workers = False +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# model settings +model = dict(type='SimpleYOLOWorldDetector', + mm_neck=True, + num_train_classes=num_classes, + num_test_classes=num_classes, + reparameterized=True, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=num_classes, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', + guide_channels=num_classes)), + bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', + embed_dims=text_channels, + num_guide=num_classes, + num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +# dataset settings +final_transform = [ + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] 
+mosaic_affine_transform = [ + dict(type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *final_transform +] + +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform] + +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader 
= val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..49d2e4bdffd9cb561399d694196b885be3524efe --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_s_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,184 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, 
_base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_s_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-55b943ea.pth' +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., 
+ scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + 
max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6ce88a89ab9ea18b0f91760d985f2313859c8762 --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_x_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,183 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict( + imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = 
[4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_x_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc250k_train_lviseval-8698fbfa.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + 
_base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + 
interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc856605e8a38ef4d8d924a06b4ec13b0a7333b --- /dev/null +++ b/models/YOLO-World/configs/finetune_coco/yolo_world_v2_xl_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py @@ -0,0 +1,173 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 
32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# scaling model from X to XL +deepen_factor = 1.0 +widen_factor = 1.5 + +backbone = _base_.model.backbone +backbone.update(deepen_factor=deepen_factor, widen_factor=widen_factor) + +# model settings +model = dict(type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + image_model=backbone, + text_model=dict(type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + widen_factor=widen_factor, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + 
scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *text_transform +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +coco_train_dataset = dict(_delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, + min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + 
save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..16067a6880b0e21f0b6ec06c98cf02626bec552e --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_800ft_lvis_minival.py @@ -0,0 +1,200 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 768 +neck_embed_channels 
= [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.0125 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-large-patch14-336' +text_model_name = 'openai/clip-vit-large-patch14-336' +img_scale = (800, 800) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + 
*_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] + +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + 
dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py 
b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..b5b84ab7a6724d25a0fb0678ed0f2f5f566afb1a --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_clip_large_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 768 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.0125 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-large-patch14-336' +text_model_name = 'openai/clip-vit-large-patch14-336' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = 
[ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + 
obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git 
a/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..1c34f3a4c99d2676f98bdacd02561c3b7896ae36 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,202 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 20 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.025 +train_batch_size_per_gpu = 4 +load_from = "pretrained_models/yolo_world_v2_l_obj365v1_goldg_pretrain-a82b1fe3.pth" +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +img_scale = (1280, 1280) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + 
head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] + +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', 
+ ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) + +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..cb8beec0af6f0fc4b0642f2f6fca4462e44eae60 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +# model settings 
+model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + 
data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) 
+custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py new file mode 100644 index 0000000000000000000000000000000000000000..70b19b287e03ea84131f9b8911b761f7eeaaa77e --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 
'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) 
+custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..9a430e8499aa329799cb0c3edd59df21e2c42f9c --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,198 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' 
+img_scale = (1280, 1280) + +# text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + 
type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + 
dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ce8129cfd6e08f7efdc09159f86d0f22351aa8 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ 
-0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = 'openai/clip-vit-large-patch14-336' +# text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + 
img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + 
type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..264b026ca780dffc91236cf53908858488337541 --- /dev/null +++ 
b/models/YOLO-World/configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,176 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv', + use_einsum=False)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes, + use_einsum=False)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + 
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] + +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] + +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + 
dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 
index 0000000000000000000000000000000000000000..3afb76aa8aab584a99c623926b5a363c1a453d89 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,195 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.025 +train_batch_size_per_gpu = 4 +img_scale = (1280, 1280) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 
'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + 
+train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + 
lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..775cc8e7867e60f4fad8d24d492d982b3463e697 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_s_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,170 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + 
block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + 
batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4cd23f4628950bd31b01422f92a0a3ee50c683 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py @@ -0,0 +1,199 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +img_scale = (1280, 1280) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + 
guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform +] + +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, 
min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# 
training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c1226d6f10bd785a03eeccf1a669f9f6531062 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 
+weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] 
+obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') 
+test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..35977e8ed93ee68ef96a9a2b98ebe02d4c18abf8 --- /dev/null +++ b/models/YOLO-World/configs/pretrain/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,185 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, 
_base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' + +# scaling model from X to XL +deepen_factor = 1.0 +widen_factor = 1.5 + +backbone = _base_.model.backbone +backbone.update( + deepen_factor=deepen_factor, + widen_factor=widen_factor +) + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model=backbone, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + widen_factor=widen_factor, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - 
_base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + 
batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain_v1/README.md b/models/YOLO-World/configs/pretrain_v1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3290c7e0f7ab6b3bd10dd5b0ecaa5371d723f915 --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/README.md @@ -0,0 +1,21 @@ +## Pre-training YOLO-World-v1 + +> The YOLO-World-v1 is an initial version, and now is nearly deprecated! We strongly suggest you use the [latest version](../pretrain/). 
+ + + +### Zero-shot Inference on LVIS dataset + +| model | Pre-train Data | Size | APmini | APr | APc | APf | APval | APr | APc | APf | weights | +| :------------------------------------------------------------------------------------------------------------------- | :------------------- | :----------------- | :--------------: | :------------: | :------------: | :------------: | :-------------: | :------------: | :------------: | :------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [YOLO-World-S](./yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 24.3 | 16.6 | 22.1 | 27.7 | 17.8 | 11.0 | 14.8 | 24.0 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_s_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-18bea4d2.pth) | +| [YOLO-World-M](./yolo_world_m_dual_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 28.6 | 19.7 | 26.6 | 31.9 | 22.3 | 16.2 | 19.0 | 28.7 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth) | +| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 640 | 32.5 | 22.3 | 30.6 | 36.1 | 24.8 | 17.8 | 22.4 | 32.5 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth) | +| [YOLO-World-L](./yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.0 | 23.6 | 32.0 | 35.5 | 25.3 | 18.0 | 22.1 | 32.1 | [HF Checkpoints 
🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth) | +| [YOLO-World-X](./yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG+CC3M-Lite | 640 | 33.4 | 24.4 | 31.6 | 36.6 | 26.6 | 19.2 | 23.5 | 33.2 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_x_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-8cf6b025.pth) | + + +**NOTE:** +1. APmini: evaluated on LVIS `minival`. +3. APval: evaluated on LVIS `val 1.0`. +4. [HuggingFace Mirror](https://hf-mirror.com/) provides the mirror of HuggingFace, which is a choice for users who are unable to reach. \ No newline at end of file diff --git a/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..e88be2eb6f54cb19d066974548ea08239ac4127f --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + 
num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + 
checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py b/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py new file mode 100644 index 0000000000000000000000000000000000000000..66333b10916d3e971d4d3c9e968ab91b48f28022 --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/yolo_world_l_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_val.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings 
+model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + 
data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + 
checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain_v1/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain_v1/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..18c3be69dca57df960b428e46800fb7543d2c1da --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/yolo_world_m_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model 
settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + 
data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = 
dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain_v1/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain_v1/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..5441d0ff995889f9ebcef97c853daf69dbbc4564 --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/yolo_world_s_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 
0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + 
dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings 
+default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/pretrain_v1/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/models/YOLO-World/configs/pretrain_v1/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..f20d3f01cb6f3a301e726ed2d3f8e7b32b61f50f --- /dev/null +++ b/models/YOLO-World/configs/pretrain_v1/yolo_world_x_dual_vlpan_l2norm_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_x_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 
2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + 
type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + 
+# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/models/YOLO-World/configs/prompt_tuning_coco/READEME.md b/models/YOLO-World/configs/prompt_tuning_coco/READEME.md new file mode 100644 index 0000000000000000000000000000000000000000..2888d1bf2ecb14d8f5d903d6aa0be38006bae204 --- /dev/null +++ b/models/YOLO-World/configs/prompt_tuning_coco/READEME.md @@ -0,0 +1,12 @@ +## Prompt Tuning for YOLO-World + +### NOTE: + +This folder contains many experimental config files, which will be removed later!! 
+ +### Experimental Results + +| Model | Config | AP | AP50 | AP75 | APS | APM | APL | +| :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: | +| YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 | +| [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 | diff --git a/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3212fa05005d31c01823e103b65792832c4342da --- /dev/null +++ b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py @@ -0,0 +1,161 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +persistent_workers = False + +# model settings +model = dict(type='SimpleYOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + 
backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + frozen_stages=4, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=True, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + freeze_all=True, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +final_transform = [ + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +mosaic_affine_transform = [ + dict(type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]), + *_base_.last_transform[:-1], *final_transform +] + +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform] + +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +train_dataloader 
= dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'embeddings': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + 
type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') +find_unused_parameters = True diff --git a/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_prompt_tuning_coco.py b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_prompt_tuning_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..64ce89d13a436d4aa04ed057b60f4586f8b350da --- /dev/null +++ b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_prompt_tuning_coco.py @@ -0,0 +1,117 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +persistent_workers = False + +# model settings +model = dict(type='SimpleYOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + freeze_prompt=False, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + frozen_stages=4, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=True, + guide_channels=text_channels, + 
embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + freeze_all=True, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) + +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + 
paramwise_cfg=dict(custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'embeddings': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_all_finetuning_coco.py b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_all_finetuning_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8caf5bd70769bfb5a728bb8c6d35448dd1ff9454 --- /dev/null +++ b/models/YOLO-World/configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_sgd_1e-3_80e_8gpus_all_finetuning_coco.py @@ -0,0 +1,109 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 40 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 1e-3 +weight_decay = 0.0005 +train_batch_size_per_gpu = 16 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +persistent_workers = False + +# model settings +model = dict(type='SimpleYOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + freeze_prompt=True, + data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'), + 
backbone=dict(_delete_=True, + type='MultiModalYOLOBackbone', + text_model=None, + image_model={{_base_.model.backbone}}, + with_text_model=False), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=False, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict( + type='YOLOWorldHeadModule', + freeze_all=False, + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +coco_train_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline) + +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +coco_val_dataset = dict(type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.test_pipeline) + +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + 
dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) + +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='SGD', + lr=base_lr, + momentum=0.937, + nesterov=True, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu)) + +# evaluation settings +val_evaluator = dict(_delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') diff --git a/models/YOLO-World/configs/segmentation/README.md b/models/YOLO-World/configs/segmentation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8cfd30341ae20ab7ff9a24fb4df03825d29f0520 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/README.md @@ -0,0 +1,27 @@ +## Fine-tuning YOLO-World for Instance Segmentation + + +### Models + +We fine-tune YOLO-World on LVIS (`LVIS-Base`) with mask annotations for open-vocabulary (zero-shot) instance segmentation. + +We provide two strategies for fine-tuning YOLO-World towards open-vocabulary instance segmentation: + +* fine-tuning `all modules`: leads to better LVIS segmentation accuracy but affects the zero-shot performance. + +* fine-tuning the `segmentation head`: maintains the zero-shot performance but lowers LVIS segmentation accuracy.
+ +| Model | Fine-tuning Data | Fine-tuning Modules| APmask | APr | APc | APf | Weights | +| :---- | :--------------- | :----------------: | :--------------: | :------------: | :------------: | :------------: | :-----: | +| [YOLO-World-Seg-M](./yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 25.9 | 13.4 | 24.9 | 32.6 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-ca465825.pth) | +| [YOLO-World-v2-Seg-M](./yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 25.9 | 13.4 | 24.9 | 32.6 | [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-L](./yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 28.7 | 15.0 | 28.3 | 35.2| [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis-8c58c916.pth) | +| [YOLO-World-v2-Seg-L](./yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py) | `LVIS-Base` | `all modules` | 28.7 | 15.0 | 28.3 | 35.2| [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-M](./yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 16.7 | 12.6 | 14.6 | 20.8 | [HF Checkpoints 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-7bca59a7.pth) | +| [YOLO-World-v2-Seg-M](./yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 17.8 | 13.9 | 15.5 | 22.0 | [HF Checkpoints 🤗]() | +| [YOLO-World-Seg-L](./yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 19.1 | 14.2 | 17.2 | 23.5 | [HF Checkpoints
🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis-5a642d30.pth) | +| [YOLO-World-v2-Seg-L](./yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py) | `LVIS-Base` | `seg head` | 19.8 | 17.2 | 17.5 | 23.6 | [HF Checkpoints 🤗]() | +**NOTE:** +1. The mask AP are evaluated on the LVIS `val 1.0`. +2. All models are fine-tuned for 80 epochs on `LVIS-Base` (866 categories, `common + frequent`). +3. The YOLO-World-Seg with only `seg head` fine-tuned maintains the original zero-shot detection capability and segments objects. diff --git a/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..01885dd5461359eb0dd026886268b28449dc6a25 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py @@ -0,0 +1,227 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' +persistent_workers = False +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +# Polygon2Mask +downsample_ratio 
= 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=[])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes, + mask_channels=32, + proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict(num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + 
num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + 
pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + 
metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..5d4174ab893a289b9f75499f1fe11e43b638ab41 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_seg_l_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -0,0 +1,237 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth' +persistent_workers = False + +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + frozen_stages=4, # frozen the image backbone + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + freeze_all=True, + 
guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes, + mask_channels=32, + proto_channels=256, + freeze_bbox=True), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict(num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., 
+ scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + 
checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'neck': + dict(lr_mult=0.0), + 'head.head_module.reg_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_contrasts': + dict(lr_mult=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py new file mode 100644 index 
0000000000000000000000000000000000000000..31331b663551f8f74af41d7efa6f9534dedf9738 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_allmodules_finetune_lvis.py @@ -0,0 +1,226 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth' +persistent_workers = False + +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=[])), + neck=dict(type='YOLOWorldDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes, + 
mask_channels=32, + proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict(num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + 
pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + 
val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..883c3225d4e1bbfbcefe96f9028b9082324c2466 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_seg_m_dual_vlpan_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -0,0 +1,237 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = 
[128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_m_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-2b7bd1be.pth' +persistent_workers = False + +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + frozen_stages=4, # frozen the image backbone + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldDualPAFPN', + freeze_all=True, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes, + mask_channels=32, + proto_channels=256, + freeze_bbox=True), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict(num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + 
transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + 
min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'neck': + dict(lr_mult=0.0), + 'head.head_module.reg_preds': + dict(lr_mult=0.0), + 
'head.head_module.cls_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_contrasts': + dict(lr_mult=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..062c9e31ed02a1ab84a68f59ca1e5f86a389a2d6 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_l_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -0,0 +1,239 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth' +# text_model_name = 
'../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + frozen_stages=4, # frozen the image backbone + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=True, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes, + mask_channels=32, + proto_channels=256, + freeze_bbox=True), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict( + type='YOLOWorldSegAssigner', + num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + 
dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + 
data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'neck': + dict(lr_mult=0.0), + 'head.head_module.reg_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_contrasts': + dict(lr_mult=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', 
+ data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py b/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..d196d4ee1956d8c94bfcef1ad6da10f6b9af39b8 --- /dev/null +++ b/models/YOLO-World/configs/segmentation/yolo_world_v2_seg_m_vlpan_bn_2e-4_80e_8gpus_seghead_finetune_lvis.py @@ -0,0 +1,239 @@ +_base_ = ( + '../../third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' +) +custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False) +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 + +weight_decay = 0.05 +train_batch_size_per_gpu = 8 +load_from = 'pretrained_models/yolo_world_m_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_train-c6237d5b.pth' +text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +# text_model_name = 'openai/clip-vit-base-patch32' +persistent_workers = False + +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = False +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 + +# model settings +model = dict( + 
type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + frozen_stages=4, # frozen the image backbone + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name=text_model_name, + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + freeze_all=True, + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldSegHead', + head_module=dict(type='YOLOWorldSegHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes, + mask_channels=32, + proto_channels=256, + freeze_bbox=True), + mask_overlap=mask_overlap, + loss_mask=dict(type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=1.0), + train_cfg=dict(assigner=dict( + type='YOLOWorldSegAssigner', + num_classes=num_training_classes)), + test_cfg=dict(mask_thr_binary=0.5, fast_test=True)) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=True) +] + +last_transform = [ + dict(type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict(type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', + 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), +] + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + 
padding_value=''), + dict(type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=True) +] +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict(type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform, *text_transform +] + +_train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict(type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict(type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] +train_pipeline_stage2 = [*_train_pipeline_stage2, *text_transform] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco', + ann_file='lvis/lvis_v1_train_base.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=True, min_size=32)), + class_text_path='data/texts/lvis_v1_base_class_texts.json', + pipeline=train_pipeline) +train_dataloader = dict(persistent_workers=persistent_workers, + 
batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] + +# training settings +default_hooks = dict(param_scheduler=dict(scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict(max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0), + 'neck': + dict(lr_mult=0.0), + 'head.head_module.reg_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_preds': + dict(lr_mult=0.0), + 'head.head_module.cls_contrasts': + dict(lr_mult=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_val.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + 
+val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_val.json', + metric=['bbox', 'segm']) +test_evaluator = val_evaluator +find_unused_parameters = True diff --git a/models/YOLO-World/data/texts/coco_class_texts.json b/models/YOLO-World/data/texts/coco_class_texts.json new file mode 100644 index 0000000000000000000000000000000000000000..b83ee71a04c5d2606793ea9e271a8422ca762ed5 --- /dev/null +++ b/models/YOLO-World/data/texts/coco_class_texts.json @@ -0,0 +1 @@ +[["person"], ["bicycle"], ["car"], ["motorcycle"], ["airplane"], ["bus"], ["train"], ["truck"], ["boat"], ["traffic light"], ["fire hydrant"], ["stop sign"], ["parking meter"], ["bench"], ["bird"], ["cat"], ["dog"], ["horse"], ["sheep"], ["cow"], ["elephant"], ["bear"], ["zebra"], ["giraffe"], ["backpack"], ["umbrella"], ["handbag"], ["tie"], ["suitcase"], ["frisbee"], ["skis"], ["snowboard"], ["sports ball"], ["kite"], ["baseball bat"], ["baseball glove"], ["skateboard"], ["surfboard"], ["tennis racket"], ["bottle"], ["wine glass"], ["cup"], ["fork"], ["knife"], ["spoon"], ["bowl"], ["banana"], ["apple"], ["sandwich"], ["orange"], ["broccoli"], ["carrot"], ["hot dog"], ["pizza"], ["donut"], ["cake"], ["chair"], ["couch"], ["potted plant"], ["bed"], ["dining table"], ["toilet"], ["tv"], ["laptop"], ["mouse"], ["remote"], ["keyboard"], ["cell phone"], ["microwave"], ["oven"], ["toaster"], ["sink"], ["refrigerator"], ["book"], ["clock"], ["vase"], ["scissors"], ["teddy bear"], ["hair drier"], ["toothbrush"]] diff --git a/models/YOLO-World/data/texts/lvis_v1_base_class_captions.json b/models/YOLO-World/data/texts/lvis_v1_base_class_captions.json new file mode 100644 index 0000000000000000000000000000000000000000..27e5e72636076fccdbbe7a93ffb56d2d8bbe0a3f --- /dev/null +++ b/models/YOLO-World/data/texts/lvis_v1_base_class_captions.json @@ -0,0 +1 @@ +[["aerosol can", "spray can"], ["air conditioner"], ["airplane", "aeroplane"], ["alarm clock"], ["alcohol", "alcoholic beverage"], 
["alligator", "gator"], ["almond"], ["ambulance"], ["amplifier"], ["anklet", "ankle bracelet"], ["antenna", "aerial", "transmitting aerial"], ["apple"], ["apron"], ["aquarium", "fish tank"], ["armband"], ["armchair"], ["artichoke"], ["trash can", "garbage can", "wastebin", "dustbin", "trash barrel", "trash bin"], ["ashtray"], ["asparagus"], ["atomizer", "atomiser", "spray", "sprayer", "nebulizer", "nebuliser"], ["avocado"], ["award", "accolade"], ["awning"], ["baby buggy", "baby carriage", "perambulator", "pram", "stroller"], ["basketball backboard"], ["backpack", "knapsack", "packsack", "rucksack", "haversack"], ["handbag", "purse", "pocketbook"], ["suitcase", "baggage", "luggage"], ["bagel", "beigel"], ["ball"], ["balloon"], ["bamboo"], ["banana"], ["Band Aid"], ["bandage"], ["bandanna", "bandana"], ["banner", "streamer"], ["barrel", "cask"], ["barrette"], ["barrow", "garden cart", "lawn cart", "wheelbarrow"], ["baseball base"], ["baseball"], ["baseball bat"], ["baseball cap", "jockey cap", "golf cap"], ["baseball glove", "baseball mitt"], ["basket", "handbasket"], ["basketball"], ["bat", "bat animal"], ["bath mat"], ["bath towel"], ["bathrobe"], ["bathtub", "bathing tub"], ["battery"], ["bead"], ["bean curd", "tofu"], ["beanbag"], ["beanie", "beany"], ["bear"], ["bed"], ["bedspread", "bedcover", "bed covering", "counterpane", "spread"], ["cow"], ["beef", "beef food", "boeuf", "boeuf food"], ["beer bottle"], ["beer can"], ["bell"], ["bell pepper", "capsicum"], ["belt"], ["belt buckle"], ["bench"], ["beret"], ["bib"], ["bicycle", "bike", "bike bicycle"], ["visor", "vizor"], ["billboard"], ["binder", "ring-binder"], ["binoculars", "field glasses", "opera glasses"], ["bird"], ["birdfeeder"], ["birdbath"], ["birdcage"], ["birdhouse"], ["birthday cake"], ["black sheep"], ["blackberry"], ["blackboard", "chalkboard"], ["blanket"], ["blazer", "sport jacket", "sport coat", "sports jacket", "sports coat"], ["blender", "liquidizer", "liquidiser"], ["blinker", "flasher"], 
["blouse"], ["blueberry"], ["boat", "ship", "ship boat"], ["bobbin", "spool", "reel"], ["bobby pin", "hairgrip"], ["boiled egg", "coddled egg"], ["deadbolt"], ["bolt"], ["book"], ["bookcase"], ["booklet", "brochure", "leaflet", "pamphlet"], ["boot"], ["bottle"], ["bottle opener"], ["bouquet"], ["bow", "bow decorative ribbons"], ["bow-tie", "bowtie"], ["bowl"], ["bowler hat", "bowler", "derby hat", "derby", "plug hat"], ["box"], ["suspenders"], ["bracelet", "bangle"], ["brassiere", "bra", "bandeau"], ["bread-bin", "breadbox"], ["bread"], ["bridal gown", "wedding gown", "wedding dress"], ["briefcase"], ["broccoli"], ["broom"], ["brownie"], ["brussels sprouts"], ["bucket", "pail"], ["horned cow"], ["bulldog"], ["bullet train"], ["bulletin board", "notice board"], ["bullhorn", "megaphone"], ["bun", "roll"], ["bunk bed"], ["buoy"], ["bus", "bus vehicle", "autobus", "charabanc", "double-decker", "motorbus", "motorcoach"], ["business card"], ["butter"], ["butterfly"], ["button"], ["cab", "cab taxi", "taxi", "taxicab"], ["cabin car", "caboose"], ["cabinet"], ["cake"], ["calculator"], ["calendar"], ["calf"], ["camcorder"], ["camel"], ["camera"], ["camera lens"], ["camper", "camper vehicle", "camping bus", "motor home"], ["can", "tin can"], ["can opener", "tin opener"], ["candle", "candlestick"], ["candle holder"], ["candy cane"], ["walking cane"], ["canister", "cannister"], ["canoe"], ["cantaloup", "cantaloupe"], ["cap", "cap headwear"], ["bottle cap", "cap", "cap container lid"], ["cape"], ["cappuccino", "coffee cappuccino"], ["car", "car automobile", "auto", "auto automobile", "automobile"], ["railcar", "railcar part of a train", "railway car", "railway car part of a train", "railroad car", "railroad car part of a train"], ["identity card"], ["card"], ["cardigan"], ["horse carriage"], ["carrot"], ["tote bag"], ["cart"], ["carton"], ["cash register", "register", "register for cash transactions"], ["cast", "plaster cast", "plaster bandage"], ["cat"], ["cauliflower"], 
["cayenne", "cayenne spice", "cayenne pepper", "cayenne pepper spice", "red pepper", "red pepper spice"], ["CD player"], ["celery"], ["cellular telephone", "cellular phone", "cellphone", "mobile phone", "smart phone"], ["chair"], ["chandelier"], ["cherry"], ["chicken", "chicken animal"], ["chickpea", "garbanzo"], ["chili", "chili vegetable", "chili pepper", "chili pepper vegetable", "chilli", "chilli vegetable", "chilly", "chilly vegetable", "chile", "chile vegetable"], ["crisp", "crisp potato chip", "potato chip"], ["chocolate bar"], ["chocolate cake"], ["choker", "collar", "neckband"], ["chopping board", "cutting board", "chopping block"], ["chopstick"], ["Christmas tree"], ["slide"], ["cigarette"], ["cigarette case", "cigarette pack"], ["cistern", "water tank"], ["clasp"], ["cleansing agent", "cleanser", "cleaner"], ["clip"], ["clipboard"], ["clock", "timepiece", "timekeeper"], ["clock tower"], ["clothes hamper", "laundry basket", "clothes basket"], ["clothespin", "clothes peg"], ["coaster"], ["coat"], ["coat hanger", "clothes hanger", "dress hanger"], ["coatrack", "hatrack"], ["cock", "rooster"], ["coconut", "cocoanut"], ["coffee maker", "coffee machine"], ["coffee table", "cocktail table"], ["coffeepot"], ["coin"], ["colander", "cullender"], ["coleslaw", "slaw"], ["pacifier", "teething ring"], ["computer keyboard", "keyboard", "keyboard computer"], ["condiment"], ["cone", "traffic cone"], ["control", "controller"], ["cookie", "cooky", "biscuit", "biscuit cookie"], ["cooler", "cooler for food", "ice chest"], ["cork", "cork bottle plug", "bottle cork"], ["corkscrew", "bottle screw"], ["edible corn", "corn", "maize"], ["cornet", "horn", "trumpet"], ["cornice", "valance", "valance board", "pelmet"], ["corset", "girdle"], ["costume"], ["cowbell"], ["cowboy hat", "ten-gallon hat"], ["crab", "crab animal"], ["cracker"], ["crate"], ["crayon", "wax crayon"], ["crescent roll", "croissant"], ["crib", "cot"], ["crock pot", "earthenware jar"], ["crossbar"], ["crow"], 
["crown"], ["crucifix"], ["cruise ship", "cruise liner"], ["police cruiser", "patrol car", "police car", "squad car"], ["crumb"], ["crutch"], ["cub", "cub animal"], ["cube", "square block"], ["cucumber", "cuke"], ["cufflink"], ["cup"], ["trophy cup"], ["cupboard", "closet"], ["cupcake"], ["curtain", "drapery"], ["cushion"], ["dartboard"], ["deck chair", "beach chair"], ["deer", "cervid"], ["dental floss", "floss"], ["desk"], ["diaper"], ["dining table"], ["dish"], ["dish antenna"], ["dishrag", "dishcloth"], ["dishtowel", "tea towel"], ["dishwasher", "dishwashing machine"], ["dispenser"], ["Dixie cup", "paper cup"], ["dog"], ["dog collar"], ["doll"], ["dolphin"], ["domestic ass", "donkey"], ["doorknob", "doorhandle"], ["doormat", "welcome mat"], ["doughnut", "donut"], ["drawer"], ["underdrawers", "boxers", "boxershorts"], ["dress", "frock"], ["dress hat", "high hat", "opera hat", "silk hat", "top hat"], ["dress suit"], ["dresser"], ["drill"], ["drum", "drum musical instrument"], ["duck"], ["duckling"], ["duct tape"], ["duffel bag", "duffle bag", "duffel", "duffle"], ["dumpster"], ["eagle"], ["earphone", "earpiece", "headphone"], ["earring"], ["easel"], ["egg", "eggs"], ["egg yolk", "yolk", "yolk egg"], ["eggbeater", "eggwhisk"], ["eggplant", "aubergine"], ["refrigerator"], ["elephant"], ["elk", "moose"], ["envelope"], ["eraser"], ["fan"], ["faucet", "spigot", "tap"], ["Ferris wheel"], ["ferry", "ferryboat"], ["fighter jet", "fighter aircraft", "attack aircraft"], ["figurine"], ["file cabinet", "filing cabinet"], ["fire alarm", "smoke alarm"], ["fire engine", "fire truck"], ["fire extinguisher", "extinguisher"], ["fire hose"], ["fireplace"], ["fireplug", "fire hydrant", "hydrant"], ["fish"], ["fish", "fish food"], ["fishing rod", "fishing pole"], ["flag"], ["flagpole", "flagstaff"], ["flamingo"], ["flannel"], ["flap"], ["flashlight", "torch"], ["flip-flop", "flip-flop sandal"], ["flipper", "flipper footwear", "fin", "fin footwear"], ["flower arrangement", "floral 
arrangement"], ["flute glass", "champagne flute"], ["foal"], ["folding chair"], ["food processor"], ["football", "football American"], ["footstool", "footrest"], ["fork"], ["forklift"], ["freight car"], ["French toast"], ["freshener", "air freshener"], ["frisbee"], ["frog", "toad", "toad frog"], ["fruit juice"], ["frying pan", "frypan", "skillet"], ["garbage truck"], ["garden hose"], ["gargle", "mouthwash"], ["garlic", "ail"], ["gazelle"], ["gelatin", "jelly"], ["giant panda", "panda", "panda bear"], ["gift wrap"], ["ginger", "gingerroot"], ["giraffe"], ["cincture", "sash", "waistband", "waistcloth"], ["glass", "glass drink container", "drinking glass"], ["globe"], ["glove"], ["goat"], ["goggles"], ["golf club", "golf-club"], ["golfcart"], ["goose"], ["grape"], ["grater"], ["gravestone", "headstone", "tombstone"], ["green bean"], ["green onion", "spring onion", "scallion"], ["grill", "grille", "grillwork", "radiator grille"], ["grizzly", "grizzly bear"], ["grocery bag"], ["guitar"], ["gull", "seagull"], ["gun"], ["hairbrush"], ["hairnet"], ["hairpin"], ["ham", "jambon", "gammon"], ["hamburger", "beefburger", "burger"], ["hammer"], ["hammock"], ["hamster"], ["hair dryer"], ["hand towel", "face towel"], ["handcart", "pushcart", "hand truck"], ["handkerchief"], ["handle", "grip", "handgrip"], ["hat"], ["veil"], ["headband"], ["headboard"], ["headlight", "headlamp"], ["headscarf"], ["headstall", "headstall for horses", "headpiece", "headpiece for horses"], ["heart"], ["heater", "warmer"], ["helicopter"], ["helmet"], ["highchair", "feeding chair"], ["hinge"], ["hog", "pig"], ["home plate", "home plate baseball", "home base", "home base baseball"], ["honey"], ["fume hood", "exhaust hood"], ["hook"], ["horse"], ["hose", "hosepipe"], ["hot sauce"], ["hummingbird"], ["polar bear"], ["icecream"], ["ice maker"], ["igniter", "ignitor", "lighter"], ["iPod"], ["iron", "iron for clothing", "smoothing iron", "smoothing iron for clothing"], ["ironing board"], ["jacket"], ["jam"], 
["jar"], ["jean", "blue jean", "denim"], ["jeep", "landrover"], ["jersey", "T-shirt", "tee shirt"], ["jet plane", "jet-propelled plane"], ["jewelry", "jewellery"], ["jumpsuit"], ["kayak"], ["kettle", "boiler"], ["key"], ["kilt"], ["kimono"], ["kitchen sink"], ["kite"], ["kitten", "kitty"], ["kiwi fruit"], ["knee pad"], ["knife"], ["knob"], ["ladder"], ["ladle"], ["ladybug", "ladybeetle", "ladybird beetle"], ["lamb", "lamb animal"], ["lamp"], ["lamppost"], ["lampshade"], ["lantern"], ["lanyard", "laniard"], ["laptop computer", "notebook computer"], ["latch"], ["legging", "legging clothing", "leging", "leging clothing", "leg covering"], ["Lego", "Lego set"], ["lemon"], ["lettuce"], ["license plate", "numberplate"], ["life buoy", "lifesaver", "life belt", "life ring"], ["life jacket", "life vest"], ["lightbulb"], ["lime"], ["lion"], ["lip balm"], ["lizard"], ["log"], ["lollipop"], ["speaker", "speaker stereo equipment"], ["loveseat"], ["magazine"], ["magnet"], ["mail slot"], ["mailbox", "mailbox at home", "letter box", "letter box at home"], ["mandarin orange"], ["manger", "trough"], ["manhole"], ["map"], ["marker"], ["mashed potato"], ["mask", "facemask"], ["mast"], ["mat", "mat gym equipment", "gym mat"], ["mattress"], ["measuring cup"], ["measuring stick", "ruler", "ruler measuring stick", "measuring rod"], ["meatball"], ["medicine"], ["melon"], ["microphone"], ["microwave oven"], ["milk"], ["minivan"], ["mirror"], ["mitten"], ["mixer", "mixer kitchen tool", "stand mixer"], ["money"], ["monitor", "monitor computer equipment"], ["monkey"], ["motor"], ["motor scooter", "scooter"], ["motorcycle"], ["mound", "mound baseball", "pitcher's mound"], ["mouse", "mouse computer equipment", "computer mouse"], ["mousepad"], ["muffin"], ["mug"], ["mushroom"], ["musical instrument", "instrument", "instrument musical"], ["napkin", "table napkin", "serviette"], ["necklace"], ["necktie", "tie", "tie necktie"], ["needle"], ["nest"], ["newspaper", "paper", "paper newspaper"], 
["newsstand"], ["nightshirt", "nightwear", "sleepwear", "nightclothes"], ["noseband", "noseband for animals", "nosepiece", "nosepiece for animals"], ["notebook"], ["notepad"], ["nut"], ["oar"], ["oil lamp", "kerosene lamp", "kerosine lamp"], ["olive oil"], ["onion"], ["orange", "orange fruit"], ["orange juice"], ["ostrich"], ["ottoman", "pouf", "pouffe", "hassock"], ["oven"], ["overalls", "overalls clothing"], ["owl"], ["packet"], ["pad"], ["paddle", "boat paddle"], ["padlock"], ["paintbrush"], ["painting"], ["pajamas", "pyjamas"], ["palette", "pallet"], ["pan", "pan for cooking", "cooking pan"], ["pancake"], ["paper plate"], ["paper towel"], ["parachute"], ["parakeet", "parrakeet", "parroket", "paraquet", "paroquet", "parroquet"], ["parasail", "parasail sports"], ["parasol", "sunshade"], ["parka", "anorak"], ["parking meter"], ["parrot"], ["passenger car", "passenger car part of a train", "coach", "coach part of a train"], ["passport"], ["pastry"], ["pea", "pea food"], ["peach"], ["peanut butter"], ["pear"], ["peeler", "peeler tool for fruit and vegetables"], ["pelican"], ["pen"], ["pencil"], ["penguin"], ["pepper", "peppercorn"], ["pepper mill", "pepper grinder"], ["perfume"], ["person", "baby", "child", "boy", "girl", "man", "woman", "human"], ["pet"], ["pew", "pew church bench", "church bench"], ["phonograph record", "phonograph recording", "record", "record phonograph recording"], ["piano"], ["pickle"], ["pickup truck"], ["pie"], ["pigeon"], ["pillow"], ["pineapple"], ["pinecone"], ["pipe", "piping"], ["pita", "pita bread", "pocket bread"], ["pitcher", "pitcher vessel for liquid", "ewer"], ["pizza"], ["place mat"], ["plate"], ["platter"], ["pliers", "plyers"], ["pocketknife"], ["poker", "poker fire stirring tool", "stove poker", "fire hook"], ["pole", "post"], ["polo shirt", "sport shirt"], ["pony"], ["pop", "pop soda", "soda", "soda pop", "tonic", "soft drink"], ["postbox", "postbox public", "mailbox", "mailbox public"], ["postcard", "postal card", 
"mailing-card"], ["poster", "placard"], ["pot"], ["flowerpot"], ["potato"], ["potholder"], ["pottery", "clayware"], ["pouch"], ["power shovel", "excavator", "digger"], ["prawn", "shrimp"], ["pretzel"], ["printer", "printing machine"], ["projectile", "projectile weapon", "missile"], ["projector"], ["propeller", "propellor"], ["pumpkin"], ["puppy"], ["quilt", "comforter"], ["rabbit"], ["racket", "racquet"], ["radiator"], ["radio receiver", "radio set", "radio", "tuner", "tuner radio"], ["radish", "daikon"], ["raft"], ["raincoat", "waterproof jacket"], ["ram", "ram animal"], ["raspberry"], ["razorblade"], ["reamer", "reamer juicer", "juicer", "juice reamer"], ["rearview mirror"], ["receipt"], ["recliner", "reclining chair", "lounger", "lounger chair"], ["record player", "phonograph", "phonograph record player", "turntable"], ["reflector"], ["remote control"], ["rhinoceros"], ["rifle"], ["ring"], ["robe"], ["rocking chair"], ["rolling pin"], ["router", "router computer equipment"], ["rubber band", "elastic band"], ["runner", "runner carpet"], ["plastic bag", "paper bag"], ["saddle", "saddle on an animal"], ["saddle blanket", "saddlecloth", "horse blanket"], ["saddlebag"], ["sail"], ["salad"], ["salami"], ["salmon", "salmon fish"], ["salsa"], ["saltshaker"], ["sandal", "sandal type of shoe"], ["sandwich"], ["saucer"], ["sausage"], ["scale", "scale measuring instrument"], ["scarf"], ["school bus"], ["scissors"], ["scoreboard"], ["screwdriver"], ["scrubbing brush"], ["sculpture"], ["seabird", "seafowl"], ["seahorse"], ["seashell"], ["sewing machine"], ["shaker"], ["shampoo"], ["shark"], ["shaving cream", "shaving soap"], ["sheep"], ["shield"], ["shirt"], ["shoe", "sneaker", "sneaker type of shoe", "tennis shoe"], ["shopping bag"], ["shopping cart"], ["short pants", "shorts", "shorts clothing", "trunks", "trunks clothing"], ["shoulder bag"], ["shovel"], ["shower head"], ["shower curtain"], ["signboard"], ["silo"], ["sink"], ["skateboard"], ["skewer"], ["ski"], ["ski 
boot"], ["ski parka", "ski jacket"], ["ski pole"], ["skirt"], ["sled", "sledge", "sleigh"], ["sleeping bag"], ["slipper", "slipper footwear", "carpet slipper", "carpet slipper footwear"], ["snowboard"], ["snowman"], ["snowmobile"], ["soap"], ["soccer ball"], ["sock"], ["sofa", "couch", "lounge"], ["solar array", "solar battery", "solar panel"], ["soup"], ["soupspoon"], ["sour cream", "soured cream"], ["spatula"], ["spectacles", "specs", "eyeglasses", "glasses"], ["spice rack"], ["spider"], ["sponge"], ["spoon"], ["sportswear", "athletic wear", "activewear"], ["spotlight"], ["squirrel"], ["stapler", "stapler stapling machine"], ["starfish", "sea star"], ["statue", "statue sculpture"], ["steak", "steak food"], ["steering wheel"], ["step stool"], ["stereo", "stereo sound system"], ["stirrup"], ["stool"], ["stop sign"], ["brake light"], ["stove", "kitchen stove", "range", "range kitchen appliance", "kitchen range", "cooking stove"], ["strainer"], ["strap"], ["straw", "straw for drinking", "drinking straw"], ["strawberry"], ["street sign"], ["streetlight", "street lamp"], ["suit", "suit clothing"], ["sunflower"], ["sunglasses"], ["sunhat"], ["surfboard"], ["sushi"], ["mop"], ["sweat pants"], ["sweatband"], ["sweater"], ["sweatshirt"], ["sweet potato"], ["swimsuit", "swimwear", "bathing suit", "swimming costume", "bathing costume", "swimming trunks", "bathing trunks"], ["sword"], ["table"], ["table lamp"], ["tablecloth"], ["tag"], ["taillight", "rear light"], ["tank", "tank storage vessel", "storage tank"], ["tank top", "tank top clothing"], ["tape", "tape sticky cloth or paper"], ["tape measure", "measuring tape"], ["tapestry"], ["tarp"], ["tartan", "plaid"], ["tassel"], ["tea bag"], ["teacup"], ["teakettle"], ["teapot"], ["teddy bear"], ["telephone", "phone", "telephone set"], ["telephone booth", "phone booth", "call box", "telephone box", "telephone kiosk"], ["telephone pole", "telegraph pole", "telegraph post"], ["television camera", "tv camera"], ["television set", 
"tv", "tv set"], ["tennis ball"], ["tennis racket"], ["thermometer"], ["thermos bottle"], ["thermostat"], ["thread", "yarn"], ["thumbtack", "drawing pin", "pushpin"], ["tiara"], ["tiger"], ["tights", "tights clothing", "leotards"], ["timer", "stopwatch"], ["tinfoil"], ["tinsel"], ["tissue paper"], ["toast", "toast food"], ["toaster"], ["toaster oven"], ["toilet"], ["toilet tissue", "toilet paper", "bathroom tissue"], ["tomato"], ["tongs"], ["toolbox"], ["toothbrush"], ["toothpaste"], ["toothpick"], ["cover"], ["tortilla"], ["tow truck"], ["towel"], ["towel rack", "towel rail", "towel bar"], ["toy"], ["tractor", "tractor farm equipment"], ["traffic light"], ["dirt bike"], ["trailer truck", "tractor trailer", "trucking rig", "articulated lorry", "semi truck"], ["train", "train railroad vehicle", "railroad train"], ["tray"], ["tricycle"], ["tripod"], ["trousers", "pants", "pants clothing"], ["truck"], ["trunk"], ["turban"], ["turkey", "turkey food"], ["turtle"], ["turtleneck", "turtleneck clothing", "polo-neck"], ["typewriter"], ["umbrella"], ["underwear", "underclothes", "underclothing", "underpants"], ["urinal"], ["urn"], ["vacuum cleaner"], ["vase"], ["vending machine"], ["vent", "blowhole", "air vent"], ["vest", "waistcoat"], ["videotape"], ["volleyball"], ["waffle"], ["wagon"], ["wagon wheel"], ["walking stick"], ["wall clock"], ["wall socket", "wall plug", "electric outlet", "electrical outlet", "outlet", "electric receptacle"], ["wallet", "billfold"], ["automatic washer", "washing machine"], ["watch", "wristwatch"], ["water bottle"], ["water cooler"], ["water faucet", "water tap", "tap", "tap water faucet"], ["water jug"], ["water scooter", "sea scooter", "jet ski"], ["water ski"], ["water tower"], ["watering can"], ["watermelon"], ["weathervane", "vane", "vane weathervane", "wind vane"], ["webcam"], ["wedding cake", "bridecake"], ["wedding ring", "wedding band"], ["wet suit"], ["wheel"], ["wheelchair"], ["whipped cream"], ["whistle"], ["wig"], ["wind chime"], 
["windmill"], ["window box", "window box for plants"], ["windshield wiper", "windscreen wiper", "wiper", "wiper for windshield or screen"], ["windsock", "air sock", "air-sleeve", "wind sleeve", "wind cone"], ["wine bottle"], ["wine bucket", "wine cooler"], ["wineglass"], ["blinder", "blinder for horses"], ["wok"], ["wooden spoon"], ["wreath"], ["wrench", "spanner"], ["wristband"], ["wristlet", "wrist band"], ["yacht"], ["yogurt", "yoghurt", "yoghourt"], ["yoke", "yoke animal equipment"], ["zebra"], ["zucchini", "courgette"]] \ No newline at end of file diff --git a/models/YOLO-World/data/texts/lvis_v1_class_texts.json b/models/YOLO-World/data/texts/lvis_v1_class_texts.json new file mode 100644 index 0000000000000000000000000000000000000000..367aaf5430da14c914503b46e4a91bd1542849dd --- /dev/null +++ b/models/YOLO-World/data/texts/lvis_v1_class_texts.json @@ -0,0 +1 @@ +[["aerosol can", "spray can"], ["air conditioner"], ["airplane", "aeroplane"], ["alarm clock"], ["alcohol", "alcoholic beverage"], ["alligator", "gator"], ["almond"], ["ambulance"], ["amplifier"], ["anklet", "ankle bracelet"], ["antenna", "aerial", "transmitting aerial"], ["apple"], ["applesauce"], ["apricot"], ["apron"], ["aquarium", "fish tank"], ["arctic", "arctic type of shoe", "galosh", "golosh", "rubber", "rubber type of shoe", "gumshoe"], ["armband"], ["armchair"], ["armoire"], ["armor", "armour"], ["artichoke"], ["trash can", "garbage can", "wastebin", "dustbin", "trash barrel", "trash bin"], ["ashtray"], ["asparagus"], ["atomizer", "atomiser", "spray", "sprayer", "nebulizer", "nebuliser"], ["avocado"], ["award", "accolade"], ["awning"], ["ax", "axe"], ["baboon"], ["baby buggy", "baby carriage", "perambulator", "pram", "stroller"], ["basketball backboard"], ["backpack", "knapsack", "packsack", "rucksack", "haversack"], ["handbag", "purse", "pocketbook"], ["suitcase", "baggage", "luggage"], ["bagel", "beigel"], ["bagpipe"], ["baguet", "baguette"], ["bait", "lure"], ["ball"], ["ballet skirt", 
"tutu"], ["balloon"], ["bamboo"], ["banana"], ["Band Aid"], ["bandage"], ["bandanna", "bandana"], ["banjo"], ["banner", "streamer"], ["barbell"], ["barge"], ["barrel", "cask"], ["barrette"], ["barrow", "garden cart", "lawn cart", "wheelbarrow"], ["baseball base"], ["baseball"], ["baseball bat"], ["baseball cap", "jockey cap", "golf cap"], ["baseball glove", "baseball mitt"], ["basket", "handbasket"], ["basketball"], ["bass horn", "sousaphone", "tuba"], ["bat", "bat animal"], ["bath mat"], ["bath towel"], ["bathrobe"], ["bathtub", "bathing tub"], ["batter", "batter food"], ["battery"], ["beachball"], ["bead"], ["bean curd", "tofu"], ["beanbag"], ["beanie", "beany"], ["bear"], ["bed"], ["bedpan"], ["bedspread", "bedcover", "bed covering", "counterpane", "spread"], ["cow"], ["beef", "beef food", "boeuf", "boeuf food"], ["beeper", "pager"], ["beer bottle"], ["beer can"], ["beetle"], ["bell"], ["bell pepper", "capsicum"], ["belt"], ["belt buckle"], ["bench"], ["beret"], ["bib"], ["Bible"], ["bicycle", "bike", "bike bicycle"], ["visor", "vizor"], ["billboard"], ["binder", "ring-binder"], ["binoculars", "field glasses", "opera glasses"], ["bird"], ["birdfeeder"], ["birdbath"], ["birdcage"], ["birdhouse"], ["birthday cake"], ["birthday card"], ["pirate flag"], ["black sheep"], ["blackberry"], ["blackboard", "chalkboard"], ["blanket"], ["blazer", "sport jacket", "sport coat", "sports jacket", "sports coat"], ["blender", "liquidizer", "liquidiser"], ["blimp"], ["blinker", "flasher"], ["blouse"], ["blueberry"], ["gameboard"], ["boat", "ship", "ship boat"], ["bob", "bobber", "bobfloat"], ["bobbin", "spool", "reel"], ["bobby pin", "hairgrip"], ["boiled egg", "coddled egg"], ["bolo tie", "bolo", "bola tie", "bola"], ["deadbolt"], ["bolt"], ["bonnet"], ["book"], ["bookcase"], ["booklet", "brochure", "leaflet", "pamphlet"], ["bookmark", "bookmarker"], ["boom microphone", "microphone boom"], ["boot"], ["bottle"], ["bottle opener"], ["bouquet"], ["bow", "bow weapon"], ["bow", "bow 
decorative ribbons"], ["bow-tie", "bowtie"], ["bowl"], ["pipe bowl"], ["bowler hat", "bowler", "derby hat", "derby", "plug hat"], ["bowling ball"], ["box"], ["boxing glove"], ["suspenders"], ["bracelet", "bangle"], ["brass plaque"], ["brassiere", "bra", "bandeau"], ["bread-bin", "breadbox"], ["bread"], ["breechcloth", "breechclout", "loincloth"], ["bridal gown", "wedding gown", "wedding dress"], ["briefcase"], ["broccoli"], ["broach"], ["broom"], ["brownie"], ["brussels sprouts"], ["bubble gum"], ["bucket", "pail"], ["horse buggy"], ["horned cow"], ["bulldog"], ["bulldozer", "dozer"], ["bullet train"], ["bulletin board", "notice board"], ["bulletproof vest"], ["bullhorn", "megaphone"], ["bun", "roll"], ["bunk bed"], ["buoy"], ["burrito"], ["bus", "bus vehicle", "autobus", "charabanc", "double-decker", "motorbus", "motorcoach"], ["business card"], ["butter"], ["butterfly"], ["button"], ["cab", "cab taxi", "taxi", "taxicab"], ["cabana"], ["cabin car", "caboose"], ["cabinet"], ["locker", "storage locker"], ["cake"], ["calculator"], ["calendar"], ["calf"], ["camcorder"], ["camel"], ["camera"], ["camera lens"], ["camper", "camper vehicle", "camping bus", "motor home"], ["can", "tin can"], ["can opener", "tin opener"], ["candle", "candlestick"], ["candle holder"], ["candy bar"], ["candy cane"], ["walking cane"], ["canister", "cannister"], ["canoe"], ["cantaloup", "cantaloupe"], ["canteen"], ["cap", "cap headwear"], ["bottle cap", "cap", "cap container lid"], ["cape"], ["cappuccino", "coffee cappuccino"], ["car", "car automobile", "auto", "auto automobile", "automobile"], ["railcar", "railcar part of a train", "railway car", "railway car part of a train", "railroad car", "railroad car part of a train"], ["elevator car"], ["car battery", "automobile battery"], ["identity card"], ["card"], ["cardigan"], ["cargo ship", "cargo vessel"], ["carnation"], ["horse carriage"], ["carrot"], ["tote bag"], ["cart"], ["carton"], ["cash register", "register", "register for cash 
transactions"], ["casserole"], ["cassette"], ["cast", "plaster cast", "plaster bandage"], ["cat"], ["cauliflower"], ["cayenne", "cayenne spice", "cayenne pepper", "cayenne pepper spice", "red pepper", "red pepper spice"], ["CD player"], ["celery"], ["cellular telephone", "cellular phone", "cellphone", "mobile phone", "smart phone"], ["chain mail", "ring mail", "chain armor", "chain armour", "ring armor", "ring armour"], ["chair"], ["chaise longue", "chaise", "daybed"], ["chalice"], ["chandelier"], ["chap"], ["checkbook", "chequebook"], ["checkerboard"], ["cherry"], ["chessboard"], ["chicken", "chicken animal"], ["chickpea", "garbanzo"], ["chili", "chili vegetable", "chili pepper", "chili pepper vegetable", "chilli", "chilli vegetable", "chilly", "chilly vegetable", "chile", "chile vegetable"], ["chime", "gong"], ["chinaware"], ["crisp", "crisp potato chip", "potato chip"], ["poker chip"], ["chocolate bar"], ["chocolate cake"], ["chocolate milk"], ["chocolate mousse"], ["choker", "collar", "neckband"], ["chopping board", "cutting board", "chopping block"], ["chopstick"], ["Christmas tree"], ["slide"], ["cider", "cyder"], ["cigar box"], ["cigarette"], ["cigarette case", "cigarette pack"], ["cistern", "water tank"], ["clarinet"], ["clasp"], ["cleansing agent", "cleanser", "cleaner"], ["cleat", "cleat for securing rope"], ["clementine"], ["clip"], ["clipboard"], ["clippers", "clippers for plants"], ["cloak"], ["clock", "timepiece", "timekeeper"], ["clock tower"], ["clothes hamper", "laundry basket", "clothes basket"], ["clothespin", "clothes peg"], ["clutch bag"], ["coaster"], ["coat"], ["coat hanger", "clothes hanger", "dress hanger"], ["coatrack", "hatrack"], ["cock", "rooster"], ["cockroach"], ["cocoa", "cocoa beverage", "hot chocolate", "hot chocolate beverage", "drinking chocolate"], ["coconut", "cocoanut"], ["coffee maker", "coffee machine"], ["coffee table", "cocktail table"], ["coffeepot"], ["coil"], ["coin"], ["colander", "cullender"], ["coleslaw", "slaw"], 
["coloring material", "colouring material"], ["combination lock"], ["pacifier", "teething ring"], ["comic book"], ["compass"], ["computer keyboard", "keyboard", "keyboard computer"], ["condiment"], ["cone", "traffic cone"], ["control", "controller"], ["convertible", "convertible automobile"], ["sofa bed"], ["cooker"], ["cookie", "cooky", "biscuit", "biscuit cookie"], ["cooking utensil"], ["cooler", "cooler for food", "ice chest"], ["cork", "cork bottle plug", "bottle cork"], ["corkboard"], ["corkscrew", "bottle screw"], ["edible corn", "corn", "maize"], ["cornbread"], ["cornet", "horn", "trumpet"], ["cornice", "valance", "valance board", "pelmet"], ["cornmeal"], ["corset", "girdle"], ["costume"], ["cougar", "puma", "catamount", "mountain lion", "panther"], ["coverall"], ["cowbell"], ["cowboy hat", "ten-gallon hat"], ["crab", "crab animal"], ["crabmeat"], ["cracker"], ["crape", "crepe", "French pancake"], ["crate"], ["crayon", "wax crayon"], ["cream pitcher"], ["crescent roll", "croissant"], ["crib", "cot"], ["crock pot", "earthenware jar"], ["crossbar"], ["crouton"], ["crow"], ["crowbar", "wrecking bar", "pry bar"], ["crown"], ["crucifix"], ["cruise ship", "cruise liner"], ["police cruiser", "patrol car", "police car", "squad car"], ["crumb"], ["crutch"], ["cub", "cub animal"], ["cube", "square block"], ["cucumber", "cuke"], ["cufflink"], ["cup"], ["trophy cup"], ["cupboard", "closet"], ["cupcake"], ["hair curler", "hair roller", "hair crimper"], ["curling iron"], ["curtain", "drapery"], ["cushion"], ["cylinder"], ["cymbal"], ["dagger"], ["dalmatian"], ["dartboard"], ["date", "date fruit"], ["deck chair", "beach chair"], ["deer", "cervid"], ["dental floss", "floss"], ["desk"], ["detergent"], ["diaper"], ["diary", "journal"], ["die", "dice"], ["dinghy", "dory", "rowboat"], ["dining table"], ["tux", "tuxedo"], ["dish"], ["dish antenna"], ["dishrag", "dishcloth"], ["dishtowel", "tea towel"], ["dishwasher", "dishwashing machine"], ["dishwasher detergent", "dishwashing 
detergent", "dishwashing liquid", "dishsoap"], ["dispenser"], ["diving board"], ["Dixie cup", "paper cup"], ["dog"], ["dog collar"], ["doll"], ["dollar", "dollar bill", "one dollar bill"], ["dollhouse", "doll's house"], ["dolphin"], ["domestic ass", "donkey"], ["doorknob", "doorhandle"], ["doormat", "welcome mat"], ["doughnut", "donut"], ["dove"], ["dragonfly"], ["drawer"], ["underdrawers", "boxers", "boxershorts"], ["dress", "frock"], ["dress hat", "high hat", "opera hat", "silk hat", "top hat"], ["dress suit"], ["dresser"], ["drill"], ["drone"], ["dropper", "eye dropper"], ["drum", "drum musical instrument"], ["drumstick"], ["duck"], ["duckling"], ["duct tape"], ["duffel bag", "duffle bag", "duffel", "duffle"], ["dumbbell"], ["dumpster"], ["dustpan"], ["eagle"], ["earphone", "earpiece", "headphone"], ["earplug"], ["earring"], ["easel"], ["eclair"], ["eel"], ["egg", "eggs"], ["egg roll", "spring roll"], ["egg yolk", "yolk", "yolk egg"], ["eggbeater", "eggwhisk"], ["eggplant", "aubergine"], ["electric chair"], ["refrigerator"], ["elephant"], ["elk", "moose"], ["envelope"], ["eraser"], ["escargot"], ["eyepatch"], ["falcon"], ["fan"], ["faucet", "spigot", "tap"], ["fedora"], ["ferret"], ["Ferris wheel"], ["ferry", "ferryboat"], ["fig", "fig fruit"], ["fighter jet", "fighter aircraft", "attack aircraft"], ["figurine"], ["file cabinet", "filing cabinet"], ["file", "file tool"], ["fire alarm", "smoke alarm"], ["fire engine", "fire truck"], ["fire extinguisher", "extinguisher"], ["fire hose"], ["fireplace"], ["fireplug", "fire hydrant", "hydrant"], ["first-aid kit"], ["fish"], ["fish", "fish food"], ["fishbowl", "goldfish bowl"], ["fishing rod", "fishing pole"], ["flag"], ["flagpole", "flagstaff"], ["flamingo"], ["flannel"], ["flap"], ["flash", "flashbulb"], ["flashlight", "torch"], ["fleece"], ["flip-flop", "flip-flop sandal"], ["flipper", "flipper footwear", "fin", "fin footwear"], ["flower arrangement", "floral arrangement"], ["flute glass", "champagne flute"], 
["foal"], ["folding chair"], ["food processor"], ["football", "football American"], ["football helmet"], ["footstool", "footrest"], ["fork"], ["forklift"], ["freight car"], ["French toast"], ["freshener", "air freshener"], ["frisbee"], ["frog", "toad", "toad frog"], ["fruit juice"], ["frying pan", "frypan", "skillet"], ["fudge"], ["funnel"], ["futon"], ["gag", "muzzle"], ["garbage"], ["garbage truck"], ["garden hose"], ["gargle", "mouthwash"], ["gargoyle"], ["garlic", "ail"], ["gasmask", "respirator", "gas helmet"], ["gazelle"], ["gelatin", "jelly"], ["gemstone"], ["generator"], ["giant panda", "panda", "panda bear"], ["gift wrap"], ["ginger", "gingerroot"], ["giraffe"], ["cincture", "sash", "waistband", "waistcloth"], ["glass", "glass drink container", "drinking glass"], ["globe"], ["glove"], ["goat"], ["goggles"], ["goldfish"], ["golf club", "golf-club"], ["golfcart"], ["gondola", "gondola boat"], ["goose"], ["gorilla"], ["gourd"], ["grape"], ["grater"], ["gravestone", "headstone", "tombstone"], ["gravy boat", "gravy holder"], ["green bean"], ["green onion", "spring onion", "scallion"], ["griddle"], ["grill", "grille", "grillwork", "radiator grille"], ["grits", "hominy grits"], ["grizzly", "grizzly bear"], ["grocery bag"], ["guitar"], ["gull", "seagull"], ["gun"], ["hairbrush"], ["hairnet"], ["hairpin"], ["halter top"], ["ham", "jambon", "gammon"], ["hamburger", "beefburger", "burger"], ["hammer"], ["hammock"], ["hamper"], ["hamster"], ["hair dryer"], ["hand glass", "hand mirror"], ["hand towel", "face towel"], ["handcart", "pushcart", "hand truck"], ["handcuff"], ["handkerchief"], ["handle", "grip", "handgrip"], ["handsaw", "carpenter's saw"], ["hardback book", "hardcover book"], ["harmonium", "organ", "organ musical instrument", "reed organ", "reed organ musical instrument"], ["hat"], ["hatbox"], ["veil"], ["headband"], ["headboard"], ["headlight", "headlamp"], ["headscarf"], ["headset"], ["headstall", "headstall for horses", "headpiece", "headpiece for 
horses"], ["heart"], ["heater", "warmer"], ["helicopter"], ["helmet"], ["heron"], ["highchair", "feeding chair"], ["hinge"], ["hippopotamus"], ["hockey stick"], ["hog", "pig"], ["home plate", "home plate baseball", "home base", "home base baseball"], ["honey"], ["fume hood", "exhaust hood"], ["hook"], ["hookah", "narghile", "nargileh", "sheesha", "shisha", "water pipe"], ["hornet"], ["horse"], ["hose", "hosepipe"], ["hot-air balloon"], ["hotplate"], ["hot sauce"], ["hourglass"], ["houseboat"], ["hummingbird"], ["hummus", "humus", "hommos", "hoummos", "humous"], ["polar bear"], ["icecream"], ["popsicle"], ["ice maker"], ["ice pack", "ice bag"], ["ice skate"], ["igniter", "ignitor", "lighter"], ["inhaler", "inhalator"], ["iPod"], ["iron", "iron for clothing", "smoothing iron", "smoothing iron for clothing"], ["ironing board"], ["jacket"], ["jam"], ["jar"], ["jean", "blue jean", "denim"], ["jeep", "landrover"], ["jelly bean", "jelly egg"], ["jersey", "T-shirt", "tee shirt"], ["jet plane", "jet-propelled plane"], ["jewel", "gem", "precious stone"], ["jewelry", "jewellery"], ["joystick"], ["jumpsuit"], ["kayak"], ["keg"], ["kennel", "doghouse"], ["kettle", "boiler"], ["key"], ["keycard"], ["kilt"], ["kimono"], ["kitchen sink"], ["kitchen table"], ["kite"], ["kitten", "kitty"], ["kiwi fruit"], ["knee pad"], ["knife"], ["knitting needle"], ["knob"], ["knocker", "knocker on a door", "doorknocker"], ["koala", "koala bear"], ["lab coat", "laboratory coat"], ["ladder"], ["ladle"], ["ladybug", "ladybeetle", "ladybird beetle"], ["lamb", "lamb animal"], ["lamb-chop", "lambchop"], ["lamp"], ["lamppost"], ["lampshade"], ["lantern"], ["lanyard", "laniard"], ["laptop computer", "notebook computer"], ["lasagna", "lasagne"], ["latch"], ["lawn mower"], ["leather"], ["legging", "legging clothing", "leging", "leging clothing", "leg covering"], ["Lego", "Lego set"], ["legume"], ["lemon"], ["lemonade"], ["lettuce"], ["license plate", "numberplate"], ["life buoy", "lifesaver", "life belt", 
"life ring"], ["life jacket", "life vest"], ["lightbulb"], ["lightning rod", "lightning conductor"], ["lime"], ["limousine"], ["lion"], ["lip balm"], ["liquor", "spirits", "hard liquor", "liqueur", "cordial"], ["lizard"], ["log"], ["lollipop"], ["speaker", "speaker stereo equipment"], ["loveseat"], ["machine gun"], ["magazine"], ["magnet"], ["mail slot"], ["mailbox", "mailbox at home", "letter box", "letter box at home"], ["mallard"], ["mallet"], ["mammoth"], ["manatee"], ["mandarin orange"], ["manger", "trough"], ["manhole"], ["map"], ["marker"], ["martini"], ["mascot"], ["mashed potato"], ["masher"], ["mask", "facemask"], ["mast"], ["mat", "mat gym equipment", "gym mat"], ["matchbox"], ["mattress"], ["measuring cup"], ["measuring stick", "ruler", "ruler measuring stick", "measuring rod"], ["meatball"], ["medicine"], ["melon"], ["microphone"], ["microscope"], ["microwave oven"], ["milestone", "milepost"], ["milk"], ["milk can"], ["milkshake"], ["minivan"], ["mint candy"], ["mirror"], ["mitten"], ["mixer", "mixer kitchen tool", "stand mixer"], ["money"], ["monitor", "monitor computer equipment"], ["monkey"], ["motor"], ["motor scooter", "scooter"], ["motor vehicle", "automotive vehicle"], ["motorcycle"], ["mound", "mound baseball", "pitcher's mound"], ["mouse", "mouse computer equipment", "computer mouse"], ["mousepad"], ["muffin"], ["mug"], ["mushroom"], ["music stool", "piano stool"], ["musical instrument", "instrument", "instrument musical"], ["nailfile"], ["napkin", "table napkin", "serviette"], ["neckerchief"], ["necklace"], ["necktie", "tie", "tie necktie"], ["needle"], ["nest"], ["newspaper", "paper", "paper newspaper"], ["newsstand"], ["nightshirt", "nightwear", "sleepwear", "nightclothes"], ["nosebag", "nosebag for animals", "feedbag"], ["noseband", "noseband for animals", "nosepiece", "nosepiece for animals"], ["notebook"], ["notepad"], ["nut"], ["nutcracker"], ["oar"], ["octopus", "octopus food"], ["octopus", "octopus animal"], ["oil lamp", "kerosene 
lamp", "kerosine lamp"], ["olive oil"], ["omelet", "omelette"], ["onion"], ["orange", "orange fruit"], ["orange juice"], ["ostrich"], ["ottoman", "pouf", "pouffe", "hassock"], ["oven"], ["overalls", "overalls clothing"], ["owl"], ["packet"], ["inkpad", "inking pad", "stamp pad"], ["pad"], ["paddle", "boat paddle"], ["padlock"], ["paintbrush"], ["painting"], ["pajamas", "pyjamas"], ["palette", "pallet"], ["pan", "pan for cooking", "cooking pan"], ["pan", "pan metal container"], ["pancake"], ["pantyhose"], ["papaya"], ["paper plate"], ["paper towel"], ["paperback book", "paper-back book", "softback book", "soft-cover book"], ["paperweight"], ["parachute"], ["parakeet", "parrakeet", "parroket", "paraquet", "paroquet", "parroquet"], ["parasail", "parasail sports"], ["parasol", "sunshade"], ["parchment"], ["parka", "anorak"], ["parking meter"], ["parrot"], ["passenger car", "passenger car part of a train", "coach", "coach part of a train"], ["passenger ship"], ["passport"], ["pastry"], ["patty", "patty food"], ["pea", "pea food"], ["peach"], ["peanut butter"], ["pear"], ["peeler", "peeler tool for fruit and vegetables"], ["wooden leg", "pegleg"], ["pegboard"], ["pelican"], ["pen"], ["pencil"], ["pencil box", "pencil case"], ["pencil sharpener"], ["pendulum"], ["penguin"], ["pennant"], ["penny", "penny coin"], ["pepper", "peppercorn"], ["pepper mill", "pepper grinder"], ["perfume"], ["persimmon"], ["person", "baby", "child", "boy", "girl", "man", "woman", "human"], ["pet"], ["pew", "pew church bench", "church bench"], ["phonebook", "telephone book", "telephone directory"], ["phonograph record", "phonograph recording", "record", "record phonograph recording"], ["piano"], ["pickle"], ["pickup truck"], ["pie"], ["pigeon"], ["piggy bank", "penny bank"], ["pillow"], ["pin", "pin non jewelry"], ["pineapple"], ["pinecone"], ["ping-pong ball"], ["pinwheel"], ["tobacco pipe"], ["pipe", "piping"], ["pistol", "handgun"], ["pita", "pita bread", "pocket bread"], ["pitcher", "pitcher 
vessel for liquid", "ewer"], ["pitchfork"], ["pizza"], ["place mat"], ["plate"], ["platter"], ["playpen"], ["pliers", "plyers"], ["plow", "plow farm equipment", "plough", "plough farm equipment"], ["plume"], ["pocket watch"], ["pocketknife"], ["poker", "poker fire stirring tool", "stove poker", "fire hook"], ["pole", "post"], ["polo shirt", "sport shirt"], ["poncho"], ["pony"], ["pool table", "billiard table", "snooker table"], ["pop", "pop soda", "soda", "soda pop", "tonic", "soft drink"], ["postbox", "postbox public", "mailbox", "mailbox public"], ["postcard", "postal card", "mailing-card"], ["poster", "placard"], ["pot"], ["flowerpot"], ["potato"], ["potholder"], ["pottery", "clayware"], ["pouch"], ["power shovel", "excavator", "digger"], ["prawn", "shrimp"], ["pretzel"], ["printer", "printing machine"], ["projectile", "projectile weapon", "missile"], ["projector"], ["propeller", "propellor"], ["prune"], ["pudding"], ["puffer", "puffer fish", "pufferfish", "blowfish", "globefish"], ["puffin"], ["pug-dog"], ["pumpkin"], ["puncher"], ["puppet", "marionette"], ["puppy"], ["quesadilla"], ["quiche"], ["quilt", "comforter"], ["rabbit"], ["race car", "racing car"], ["racket", "racquet"], ["radar"], ["radiator"], ["radio receiver", "radio set", "radio", "tuner", "tuner radio"], ["radish", "daikon"], ["raft"], ["rag doll"], ["raincoat", "waterproof jacket"], ["ram", "ram animal"], ["raspberry"], ["rat"], ["razorblade"], ["reamer", "reamer juicer", "juicer", "juice reamer"], ["rearview mirror"], ["receipt"], ["recliner", "reclining chair", "lounger", "lounger chair"], ["record player", "phonograph", "phonograph record player", "turntable"], ["reflector"], ["remote control"], ["rhinoceros"], ["rib", "rib food"], ["rifle"], ["ring"], ["river boat"], ["road map"], ["robe"], ["rocking chair"], ["rodent"], ["roller skate"], ["Rollerblade"], ["rolling pin"], ["root beer"], ["router", "router computer equipment"], ["rubber band", "elastic band"], ["runner", "runner carpet"], 
["plastic bag", "paper bag"], ["saddle", "saddle on an animal"], ["saddle blanket", "saddlecloth", "horse blanket"], ["saddlebag"], ["safety pin"], ["sail"], ["salad"], ["salad plate", "salad bowl"], ["salami"], ["salmon", "salmon fish"], ["salmon", "salmon food"], ["salsa"], ["saltshaker"], ["sandal", "sandal type of shoe"], ["sandwich"], ["satchel"], ["saucepan"], ["saucer"], ["sausage"], ["sawhorse", "sawbuck"], ["saxophone"], ["scale", "scale measuring instrument"], ["scarecrow", "strawman"], ["scarf"], ["school bus"], ["scissors"], ["scoreboard"], ["scraper"], ["screwdriver"], ["scrubbing brush"], ["sculpture"], ["seabird", "seafowl"], ["seahorse"], ["seaplane", "hydroplane"], ["seashell"], ["sewing machine"], ["shaker"], ["shampoo"], ["shark"], ["sharpener"], ["Sharpie"], ["shaver", "shaver electric", "electric shaver", "electric razor"], ["shaving cream", "shaving soap"], ["shawl"], ["shears"], ["sheep"], ["shepherd dog", "sheepdog"], ["sherbert", "sherbet"], ["shield"], ["shirt"], ["shoe", "sneaker", "sneaker type of shoe", "tennis shoe"], ["shopping bag"], ["shopping cart"], ["short pants", "shorts", "shorts clothing", "trunks", "trunks clothing"], ["shot glass"], ["shoulder bag"], ["shovel"], ["shower head"], ["shower cap"], ["shower curtain"], ["shredder", "shredder for paper"], ["signboard"], ["silo"], ["sink"], ["skateboard"], ["skewer"], ["ski"], ["ski boot"], ["ski parka", "ski jacket"], ["ski pole"], ["skirt"], ["skullcap"], ["sled", "sledge", "sleigh"], ["sleeping bag"], ["sling", "sling bandage", "triangular bandage"], ["slipper", "slipper footwear", "carpet slipper", "carpet slipper footwear"], ["smoothie"], ["snake", "serpent"], ["snowboard"], ["snowman"], ["snowmobile"], ["soap"], ["soccer ball"], ["sock"], ["sofa", "couch", "lounge"], ["softball"], ["solar array", "solar battery", "solar panel"], ["sombrero"], ["soup"], ["soup bowl"], ["soupspoon"], ["sour cream", "soured cream"], ["soya milk", "soybean milk", "soymilk"], ["space shuttle"], 
["sparkler", "sparkler fireworks"], ["spatula"], ["spear", "lance"], ["spectacles", "specs", "eyeglasses", "glasses"], ["spice rack"], ["spider"], ["crawfish", "crayfish"], ["sponge"], ["spoon"], ["sportswear", "athletic wear", "activewear"], ["spotlight"], ["squid", "squid food", "calamari", "calamary"], ["squirrel"], ["stagecoach"], ["stapler", "stapler stapling machine"], ["starfish", "sea star"], ["statue", "statue sculpture"], ["steak", "steak food"], ["steak knife"], ["steering wheel"], ["stepladder"], ["step stool"], ["stereo", "stereo sound system"], ["stew"], ["stirrer"], ["stirrup"], ["stool"], ["stop sign"], ["brake light"], ["stove", "kitchen stove", "range", "range kitchen appliance", "kitchen range", "cooking stove"], ["strainer"], ["strap"], ["straw", "straw for drinking", "drinking straw"], ["strawberry"], ["street sign"], ["streetlight", "street lamp"], ["string cheese"], ["stylus"], ["subwoofer"], ["sugar bowl"], ["sugarcane", "sugarcane plant"], ["suit", "suit clothing"], ["sunflower"], ["sunglasses"], ["sunhat"], ["surfboard"], ["sushi"], ["mop"], ["sweat pants"], ["sweatband"], ["sweater"], ["sweatshirt"], ["sweet potato"], ["swimsuit", "swimwear", "bathing suit", "swimming costume", "bathing costume", "swimming trunks", "bathing trunks"], ["sword"], ["syringe"], ["Tabasco sauce"], ["table-tennis table", "ping-pong table"], ["table"], ["table lamp"], ["tablecloth"], ["tachometer"], ["taco"], ["tag"], ["taillight", "rear light"], ["tambourine"], ["army tank", "armored combat vehicle", "armoured combat vehicle"], ["tank", "tank storage vessel", "storage tank"], ["tank top", "tank top clothing"], ["tape", "tape sticky cloth or paper"], ["tape measure", "measuring tape"], ["tapestry"], ["tarp"], ["tartan", "plaid"], ["tassel"], ["tea bag"], ["teacup"], ["teakettle"], ["teapot"], ["teddy bear"], ["telephone", "phone", "telephone set"], ["telephone booth", "phone booth", "call box", "telephone box", "telephone kiosk"], ["telephone pole", "telegraph 
pole", "telegraph post"], ["telephoto lens", "zoom lens"], ["television camera", "tv camera"], ["television set", "tv", "tv set"], ["tennis ball"], ["tennis racket"], ["tequila"], ["thermometer"], ["thermos bottle"], ["thermostat"], ["thimble"], ["thread", "yarn"], ["thumbtack", "drawing pin", "pushpin"], ["tiara"], ["tiger"], ["tights", "tights clothing", "leotards"], ["timer", "stopwatch"], ["tinfoil"], ["tinsel"], ["tissue paper"], ["toast", "toast food"], ["toaster"], ["toaster oven"], ["toilet"], ["toilet tissue", "toilet paper", "bathroom tissue"], ["tomato"], ["tongs"], ["toolbox"], ["toothbrush"], ["toothpaste"], ["toothpick"], ["cover"], ["tortilla"], ["tow truck"], ["towel"], ["towel rack", "towel rail", "towel bar"], ["toy"], ["tractor", "tractor farm equipment"], ["traffic light"], ["dirt bike"], ["trailer truck", "tractor trailer", "trucking rig", "articulated lorry", "semi truck"], ["train", "train railroad vehicle", "railroad train"], ["trampoline"], ["tray"], ["trench coat"], ["triangle", "triangle musical instrument"], ["tricycle"], ["tripod"], ["trousers", "pants", "pants clothing"], ["truck"], ["truffle", "truffle chocolate", "chocolate truffle"], ["trunk"], ["vat"], ["turban"], ["turkey", "turkey food"], ["turnip"], ["turtle"], ["turtleneck", "turtleneck clothing", "polo-neck"], ["typewriter"], ["umbrella"], ["underwear", "underclothes", "underclothing", "underpants"], ["unicycle"], ["urinal"], ["urn"], ["vacuum cleaner"], ["vase"], ["vending machine"], ["vent", "blowhole", "air vent"], ["vest", "waistcoat"], ["videotape"], ["vinegar"], ["violin", "fiddle"], ["vodka"], ["volleyball"], ["vulture"], ["waffle"], ["waffle iron"], ["wagon"], ["wagon wheel"], ["walking stick"], ["wall clock"], ["wall socket", "wall plug", "electric outlet", "electrical outlet", "outlet", "electric receptacle"], ["wallet", "billfold"], ["walrus"], ["wardrobe"], ["washbasin", "basin", "basin for washing", "washbowl", "washstand", "handbasin"], ["automatic washer", 
"washing machine"], ["watch", "wristwatch"], ["water bottle"], ["water cooler"], ["water faucet", "water tap", "tap", "tap water faucet"], ["water heater", "hot-water heater"], ["water jug"], ["water gun", "squirt gun"], ["water scooter", "sea scooter", "jet ski"], ["water ski"], ["water tower"], ["watering can"], ["watermelon"], ["weathervane", "vane", "vane weathervane", "wind vane"], ["webcam"], ["wedding cake", "bridecake"], ["wedding ring", "wedding band"], ["wet suit"], ["wheel"], ["wheelchair"], ["whipped cream"], ["whistle"], ["wig"], ["wind chime"], ["windmill"], ["window box", "window box for plants"], ["windshield wiper", "windscreen wiper", "wiper", "wiper for windshield or screen"], ["windsock", "air sock", "air-sleeve", "wind sleeve", "wind cone"], ["wine bottle"], ["wine bucket", "wine cooler"], ["wineglass"], ["blinder", "blinder for horses"], ["wok"], ["wolf"], ["wooden spoon"], ["wreath"], ["wrench", "spanner"], ["wristband"], ["wristlet", "wrist band"], ["yacht"], ["yogurt", "yoghurt", "yoghourt"], ["yoke", "yoke animal equipment"], ["zebra"], ["zucchini", "courgette"]] \ No newline at end of file diff --git a/models/YOLO-World/data/texts/obj365v1_class_texts.json b/models/YOLO-World/data/texts/obj365v1_class_texts.json new file mode 100644 index 0000000000000000000000000000000000000000..bddc11c0b9721bb4b7addc9a557a2eed1c9fe0fc --- /dev/null +++ b/models/YOLO-World/data/texts/obj365v1_class_texts.json @@ -0,0 +1 @@ +[["person"], ["sneakers"], ["chair"], ["hat"], ["lamp"], ["bottle"], ["cabinet", "shelf"], ["cup"], ["car"], ["glasses"], ["picture", "frame"], ["desk"], ["handbag"], ["street lights"], ["book"], ["plate"], ["helmet"], ["leather shoes"], ["pillow"], ["glove"], ["potted plant"], ["bracelet"], ["flower"], ["tv"], ["storage box"], ["vase"], ["bench"], ["wine glass"], ["boots"], ["bowl"], ["dining table"], ["umbrella"], ["boat"], ["flag"], ["speaker"], ["trash bin", "can"], ["stool"], ["backpack"], ["couch"], ["belt"], ["carpet"], 
["basket"], ["towel", "napkin"], ["slippers"], ["barrel", "bucket"], ["coffee table"], ["suv"], ["toy"], ["tie"], ["bed"], ["traffic light"], ["pen", "pencil"], ["microphone"], ["sandals"], ["canned"], ["necklace"], ["mirror"], ["faucet"], ["bicycle"], ["bread"], ["high heels"], ["ring"], ["van"], ["watch"], ["sink"], ["horse"], ["fish"], ["apple"], ["camera"], ["candle"], ["teddy bear"], ["cake"], ["motorcycle"], ["wild bird"], ["laptop"], ["knife"], ["traffic sign"], ["cell phone"], ["paddle"], ["truck"], ["cow"], ["power outlet"], ["clock"], ["drum"], ["fork"], ["bus"], ["hanger"], ["nightstand"], ["pot", "pan"], ["sheep"], ["guitar"], ["traffic cone"], ["tea pot"], ["keyboard"], ["tripod"], ["hockey"], ["fan"], ["dog"], ["spoon"], ["blackboard", "whiteboard"], ["balloon"], ["air conditioner"], ["cymbal"], ["mouse"], ["telephone"], ["pickup truck"], ["orange"], ["banana"], ["airplane"], ["luggage"], ["skis"], ["soccer"], ["trolley"], ["oven"], ["remote"], ["baseball glove"], ["paper towel"], ["refrigerator"], ["train"], ["tomato"], ["machinery vehicle"], ["tent"], ["shampoo", "shower gel"], ["head phone"], ["lantern"], ["donut"], ["cleaning products"], ["sailboat"], ["tangerine"], ["pizza"], ["kite"], ["computer box"], ["elephant"], ["toiletries"], ["gas stove"], ["broccoli"], ["toilet"], ["stroller"], ["shovel"], ["baseball bat"], ["microwave"], ["skateboard"], ["surfboard"], ["surveillance camera"], ["gun"], ["life saver"], ["cat"], ["lemon"], ["liquid soap"], ["zebra"], ["duck"], ["sports car"], ["giraffe"], ["pumpkin"], ["piano"], ["stop sign"], ["radiator"], ["converter"], ["tissue"], ["carrot"], ["washing machine"], ["vent"], ["cookies"], ["cutting", "chopping board"], ["tennis racket"], ["candy"], ["skating and skiing shoes"], ["scissors"], ["folder"], ["baseball"], ["strawberry"], ["bow tie"], ["pigeon"], ["pepper"], ["coffee machine"], ["bathtub"], ["snowboard"], ["suitcase"], ["grapes"], ["ladder"], ["pear"], ["american football"], ["basketball"], 
["potato"], ["paint brush"], ["printer"], ["billiards"], ["fire hydrant"], ["goose"], ["projector"], ["sausage"], ["fire extinguisher"], ["extension cord"], ["facial mask"], ["tennis ball"], ["chopsticks"], ["electronic stove and gas stove"], ["pie"], ["frisbee"], ["kettle"], ["hamburger"], ["golf club"], ["cucumber"], ["clutch"], ["blender"], ["tong"], ["slide"], ["hot dog"], ["toothbrush"], ["facial cleanser"], ["mango"], ["deer"], ["egg"], ["violin"], ["marker"], ["ship"], ["chicken"], ["onion"], ["ice cream"], ["tape"], ["wheelchair"], ["plum"], ["bar soap"], ["scale"], ["watermelon"], ["cabbage"], ["router", "modem"], ["golf ball"], ["pine apple"], ["crane"], ["fire truck"], ["peach"], ["cello"], ["notepaper"], ["tricycle"], ["toaster"], ["helicopter"], ["green beans"], ["brush"], ["carriage"], ["cigar"], ["earphone"], ["penguin"], ["hurdle"], ["swing"], ["radio"], ["cd"], ["parking meter"], ["swan"], ["garlic"], ["french fries"], ["horn"], ["avocado"], ["saxophone"], ["trumpet"], ["sandwich"], ["cue"], ["kiwi fruit"], ["bear"], ["fishing rod"], ["cherry"], ["tablet"], ["green vegetables"], ["nuts"], ["corn"], ["key"], ["screwdriver"], ["globe"], ["broom"], ["pliers"], ["volleyball"], ["hammer"], ["eggplant"], ["trophy"], ["dates"], ["board eraser"], ["rice"], ["tape measure", "ruler"], ["dumbbell"], ["hamimelon"], ["stapler"], ["camel"], ["lettuce"], ["goldfish"], ["meat balls"], ["medal"], ["toothpaste"], ["antelope"], ["shrimp"], ["rickshaw"], ["trombone"], ["pomegranate"], ["coconut"], ["jellyfish"], ["mushroom"], ["calculator"], ["treadmill"], ["butterfly"], ["egg tart"], ["cheese"], ["pig"], ["pomelo"], ["race car"], ["rice cooker"], ["tuba"], ["crosswalk sign"], ["papaya"], ["hair drier"], ["green onion"], ["chips"], ["dolphin"], ["sushi"], ["urinal"], ["donkey"], ["electric drill"], ["spring rolls"], ["tortoise", "turtle"], ["parrot"], ["flute"], ["measuring cup"], ["shark"], ["steak"], ["poker card"], ["binoculars"], ["llama"], ["radish"], 
["noodles"], ["yak"], ["mop"], ["crab"], ["microscope"], ["barbell"], ["bread", "bun"], ["baozi"], ["lion"], ["red cabbage"], ["polar bear"], ["lighter"], ["seal"], ["mangosteen"], ["comb"], ["eraser"], ["pitaya"], ["scallop"], ["pencil case"], ["saw"], ["table tennis paddle"], ["okra"], ["starfish"], ["eagle"], ["monkey"], ["durian"], ["game board"], ["rabbit"], ["french horn"], ["ambulance"], ["asparagus"], ["hoverboard"], ["pasta"], ["target"], ["hotair balloon"], ["chainsaw"], ["lobster"], ["iron"], ["flashlight"]] \ No newline at end of file diff --git a/models/YOLO-World/demo/README.md b/models/YOLO-World/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c6f607c5044ecb85c52cc5254006382bb648a4b1 --- /dev/null +++ b/models/YOLO-World/demo/README.md @@ -0,0 +1,65 @@ +## YOLO-World Demo + +### Getting Started + +Set `PYTHONPATH` to the path of `YOLO-World` and run: + +```bash +PYTHONPATH=/xxxx/YOLO-World python demo/yyyy_demo.py +# or directly +PYTHONPATH=./ python demo/yyyy_demo.py +``` + +#### Gradio Demo + +We provide the [Gradio](https://www.gradio.app/) demo for local devices: + +```bash +pip install gradio==4.16.0 +python demo/demo.py path/to/config path/to/weights +``` + +Additionally, you can use a Dockerfile to build an image with gradio. As a prerequisite, make sure you have the respective drivers installed alongside [nvidia-container-runtime](https://stackoverflow.com/questions/59691207/docker-build-with-nvidia-runtime). Replace MODEL_NAME and WEIGHT_NAME with the respective values, or omit them and use the default values from the [Dockerfile](Dockerfile#3) + +```bash +docker build --build-arg="MODEL=MODEL_NAME" --build-arg="WEIGHT=WEIGHT_NAME" -t yolo_demo . +docker run --runtime nvidia -p 8080:8080 yolo_demo +``` + +#### Image Demo + +We provide a simple image demo for inference on images with visualization outputs. 
+ +```bash +python demo/image_demo.py path/to/config path/to/weights image/path/directory 'person,dog,cat' --topk 100 --threshold 0.005 --output-dir demo_outputs +``` + +**Notes:** +* The `image` can be a directory or a single image. +* The `texts` can be a string of categories (noun phrases) separated by commas. We also support a `txt` file in which each line contains one category (noun phrase). +* The `topk` and `threshold` control the number of predictions and the confidence threshold. + + +#### Video Demo + +The `video_demo` takes arguments similar to those of `image_demo`. + +```bash +python demo/video_demo.py path/to/config path/to/weights video_path 'person,dog' --out out_video_path +``` + +### FAQ + +> 1. `Failed to custom import!` +```bash + File "simple_demo.py", line 37, in <module> + cfg = Config.fromfile(config_file) + File "/data/miniconda3/envs/det/lib/python3.8/site-packages/mmengine/config/config.py", line 183, in fromfile + raise ImportError('Failed to custom import!') from e +ImportError: Failed to custom import! +``` +**Solution:** + +```bash +PYTHONPATH=/xxxx/YOLO-World python demo/simple_demo.py +``` \ No newline at end of file diff --git a/models/YOLO-World/demo/gradio_demo.py b/models/YOLO-World/demo/gradio_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..739e97beaa8641885f25fa2a4d1bdcbbfc95c20e --- /dev/null +++ b/models/YOLO-World/demo/gradio_demo.py @@ -0,0 +1,253 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
class LabelAnnotator(sv.LabelAnnotator):
    """``sv.LabelAnnotator`` variant with a custom label background box.

    The box is anchored at the coordinates it is given instead of being
    derived from ``position``.
    """

    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        """Return the label background box as ``(x1, y1, x2, y2)``.

        The box starts at ``center_coordinates`` and extends right/down by
        the text width and height. ``position`` is accepted but not used.
        """
        x_min, y_min = center_coordinates
        width, height = text_wh
        x_max = x_min + width
        y_max = y_min + height
        return x_min, y_min, x_max, y_max
def run_image(runner,
              image,
              text,
              max_num_boxes,
              score_thr,
              nms_thr,
              image_path='./work_dirs/demo.png'):
    """Detect the classes named in ``text`` on ``image`` and draw them.

    Args:
        runner: Object exposing ``pipeline`` (test-time transform) and
            ``model`` (detector providing ``test_step``).
        image: Input image; converted with ``np.array`` and treated as RGB.
        text: Comma-separated class names to detect.
        max_num_boxes: Maximum number of detections kept (top scores).
        score_thr: Detections with a score not above this value are dropped.
        nms_thr: IoU threshold passed to ``nms``.
        image_path: Unused; kept so existing callers keep working.

    Returns:
        PIL.Image.Image: The image with boxes, labels and, when the model
        predicts them, masks drawn on it.
    """
    # One single-item prompt list per class, followed by a blank prompt.
    texts = [[t.strip()] for t in text.split(',')] + [[' ']]
    data_info = dict(img_id=0, img=np.array(image), texts=texts)
    data_info = runner.pipeline(data_info)
    data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                      data_samples=[data_info['data_samples']])

    with autocast(enabled=False), torch.no_grad():
        output = runner.model.test_step(data_batch)[0]
        pred_instances = output.pred_instances

    # NMS is applied over all boxes together, regardless of label.
    keep = nms(pred_instances.bboxes,
               pred_instances.scores,
               iou_threshold=nms_thr)
    pred_instances = pred_instances[keep]
    pred_instances = pred_instances[pred_instances.scores.float() > score_thr]

    # ``topk`` needs an int; UI sliders may hand over a float.
    max_num_boxes = int(max_num_boxes)
    if len(pred_instances.scores) > max_num_boxes:
        indices = pred_instances.scores.float().topk(max_num_boxes)[1]
        pred_instances = pred_instances[indices]

    pred_instances = pred_instances.cpu().numpy()
    masks = pred_instances['masks'] if 'masks' in pred_instances else None
    detections = sv.Detections(xyxy=pred_instances['bboxes'],
                               class_id=pred_instances['labels'],
                               confidence=pred_instances['scores'],
                               mask=masks)
    labels = [
        f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
        zip(detections.class_id, detections.confidence)
    ]

    image = np.array(image)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR
    image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
    image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
    if masks is not None:
        image = MASK_ANNOTATOR.annotate(image, detections)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    image = Image.fromarray(image)
    return image


def export_model(runner, text, max_num_boxes, score_thr, nms_thr):
    """Re-parameterize the model for ``text`` and export it to ONNX.

    The file is written to ``<args.work_dir>/<checkpoint stem>.onnx``, where
    ``args`` is the module-level namespace parsed in the ``__main__`` block.

    Args:
        runner: Object whose ``model`` is exported.
        text: Comma-separated class names baked into the exported model.
        max_num_boxes: ``keep_top_k`` of the post-processing config;
            ``pre_top_k`` is set to ten times this value.
        score_thr: Score threshold of the post-processing config.
        nms_thr: IoU threshold of the post-processing config.

    Returns:
        tuple: ``(gr.update(visible=True), save_onnx_path)``.
    """
    backend = EM.MMYOLOBackend.ONNXRUNTIME
    postprocess_cfg = ConfigDict(pre_top_k=10 * max_num_boxes,
                                 keep_top_k=max_num_boxes,
                                 iou_threshold=nms_thr,
                                 score_threshold=score_thr)

    base_model = runner.model

    # Unlike ``run_image``, all prompts go into one inner list here.
    texts = [[t.strip() for t in text.split(',')] + [' ']]
    base_model.reparameterize(texts)
    deploy_model = EM.DeployModel(baseModel=base_model,
                                  backend=backend,
                                  postprocess_cfg=postprocess_cfg)
    deploy_model.eval()

    device = (next(iter(base_model.parameters()))).device
    fake_input = torch.ones([1, 3, 640, 640], device=device)
    # Run once before export, as the original script does.
    deploy_model(fake_input)

    # Swap only the file extension. ``str.replace('pth', 'onnx')`` would also
    # rewrite "pth" inside the name and leave non-.pth names unchanged.
    checkpoint_stem = osp.splitext(osp.basename(args.checkpoint))[0]
    os.makedirs(args.work_dir, exist_ok=True)
    save_onnx_path = osp.join(args.work_dir, checkpoint_stem + '.onnx')

    # export onnx
    with BytesIO() as f:
        output_names = ['num_dets', 'boxes', 'scores', 'labels']
        torch.onnx.export(deploy_model,
                          fake_input,
                          f,
                          input_names=['images'],
                          output_names=output_names,
                          opset_version=12)
        f.seek(0)
        onnx_model = onnx.load(f)
        onnx.checker.check_model(onnx_model)

    # Keep the simplified graph only if onnxsim validated it.
    simplified_model, check = onnxsim.simplify(onnx_model)
    if check:
        onnx_model = simplified_model
    onnx.save(onnx_model, save_onnx_path)
    return gr.update(visible=True), save_onnx_path


def demo(runner, args):
    """Build the Gradio UI and serve it on ``0.0.0.0:8080``.

    Args:
        runner: Bound as first argument of ``run_image`` / ``export_model``.
        args: Parsed command-line namespace; not read inside this function.
    """
    with gr.Blocks(title="YOLO-World") as demo:
        with gr.Row():
            gr.Markdown('<h1><center>YOLO-World: Real-Time Open-Vocabulary '
                        'Object Detector</center></h1>')
        with gr.Row():
            with gr.Column(scale=0.3):
                with gr.Row():
                    image = gr.Image(type='pil', label='input image')
                input_text = gr.Textbox(
                    lines=7,
                    label='Enter the classes to be detected, '
                    'separated by comma',
                    value=', '.join(CocoDataset.METAINFO['classes']),
                    elem_id='textbox')
                with gr.Row():
                    submit = gr.Button('Submit')
                    clear = gr.Button('Clear')
                with gr.Row():
                    export = gr.Button('Deploy and Export ONNX Model')
                with gr.Row():
                    gr.Markdown(
                        "It takes a few seconds to generate the ONNX file! YOLO-World-Seg (segmentation) is not supported now"
                    )
                    # Hidden until an export finishes.
                    out_download = gr.File(visible=False)
                max_num_boxes = gr.Slider(minimum=1,
                                          maximum=300,
                                          value=100,
                                          step=1,
                                          interactive=True,
                                          label='Maximum Number Boxes')
                score_thr = gr.Slider(minimum=0,
                                      maximum=1,
                                      value=0.05,
                                      step=0.001,
                                      interactive=True,
                                      label='Score Threshold')
                nms_thr = gr.Slider(minimum=0,
                                    maximum=1,
                                    value=0.7,
                                    step=0.001,
                                    interactive=True,
                                    label='NMS Threshold')
            with gr.Column(scale=0.7):
                output_image = gr.Image(type='pil', label='output image')

        submit.click(partial(run_image, runner),
                     [image, input_text, max_num_boxes, score_thr, nms_thr],
                     [output_image])
        clear.click(lambda: [None, '', None], None,
                    [image, input_text, output_image])

        # Both return values (visibility update, file path) target the
        # same download component.
        export.click(partial(export_model, runner),
                     [input_text, max_num_boxes, score_thr, nms_thr],
                     [out_download, out_download])

        demo.launch(server_name='0.0.0.0',
                    server_port=8080)  # port 80 does not work for me
class LabelAnnotator(sv.LabelAnnotator):
    """``sv.LabelAnnotator`` variant with a custom label background box.

    The box is anchored at the coordinates it is given instead of being
    derived from ``position``.
    """

    @staticmethod
    def resolve_text_background_xyxy(
        center_coordinates,
        text_wh,
        position,
    ):
        """Return the label background box as ``(x1, y1, x2, y2)``.

        The box starts at ``center_coordinates`` and extends right/down by
        the text width and height. ``position`` is accepted but not used.
        """
        x_min, y_min = center_coordinates
        width, height = text_wh
        x_max = x_min + width
        y_max = y_min + height
        return x_min, y_min, x_max, y_max
+ ) + parser.add_argument('--topk', + default=100, + type=int, + help='keep topk predictions.') + parser.add_argument('--threshold', + default=0.1, + type=float, + help='confidence score threshold for predictions.') + parser.add_argument('--device', + default='cuda:0', + help='device used for inference.') + parser.add_argument('--show', + action='store_true', + help='show the detection results.') + parser.add_argument( + '--annotation', + action='store_true', + help='save the annotated detection results as yolo text format.') + parser.add_argument('--amp', + action='store_true', + help='use mixed precision for inference.') + parser.add_argument('--output-dir', + default='demo_outputs', + help='the directory to save outputs') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
def inference_detector(model,
                       image,
                       texts,
                       test_pipeline,
                       max_dets=100,
                       score_thr=0.3,
                       output_dir='./work_dir',
                       use_amp=False,
                       show=False,
                       annotation=False):
    """Run detection on one image file and save the annotated result.

    Args:
        model: Detector providing ``test_step``.
        image: Path of the image file to process.
        texts: Prompts as a list of single-item lists; ``texts[i][0]`` is
            used as the display name of class ``i``.
        test_pipeline: Callable turning the data-info dict into model inputs.
        max_dets: Maximum number of detections kept (top scores).
        score_thr: Detections with a score not above this value are dropped.
        output_dir: Directory the annotated image is written to.
        use_amp: Enable autocast during inference.
        show: Display the annotated image in an OpenCV window.
        annotation: Also write the detections as YOLO-format labels to
            ``./annotations``.
    """
    # ``image`` is a path; use a local name instead of the ``image_path``
    # global that happens to exist in the ``__main__`` loop.
    image_path = image
    data_info = dict(img_id=0, img_path=image_path, texts=texts)
    data_info = test_pipeline(data_info)
    data_batch = dict(inputs=data_info['inputs'].unsqueeze(0),
                      data_samples=[data_info['data_samples']])

    with autocast(enabled=use_amp), torch.no_grad():
        output = model.test_step(data_batch)[0]
        pred_instances = output.pred_instances
        pred_instances = pred_instances[pred_instances.scores.float() >
                                        score_thr]

    if len(pred_instances.scores) > max_dets:
        indices = pred_instances.scores.float().topk(max_dets)[1]
        pred_instances = pred_instances[indices]

    pred_instances = pred_instances.cpu().numpy()
    masks = pred_instances['masks'] if 'masks' in pred_instances else None

    detections = sv.Detections(xyxy=pred_instances['bboxes'],
                               class_id=pred_instances['labels'],
                               confidence=pred_instances['scores'],
                               mask=masks)

    labels = [
        f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in
        zip(detections.class_id, detections.confidence)
    ]

    # label images
    image_name = osp.basename(image_path)
    image = cv2.imread(image_path)
    anno_image = image.copy()  # untouched copy for the annotation export
    image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections)
    image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels)
    if masks is not None:
        image = MASK_ANNOTATOR.annotate(image, detections)
    # Create the output directory so the write below has a target.
    os.makedirs(output_dir, exist_ok=True)
    cv2.imwrite(osp.join(output_dir, image_name), image)

    if annotation:
        images_dict = {image_name: anno_image}
        annotations_dict = {image_name: detections}

        # ``os.makedirs`` returns None, so keep the path in its own variable;
        # passing None as the directory meant no labels were written.
        annotations_directory = './annotations'
        os.makedirs(annotations_directory, exist_ok=True)

        min_image_area_percentage = 0.002
        max_image_area_percentage = 0.80
        approximation_percentage = 0.75

        sv.DetectionDataset(
            classes=[prompt[0] for prompt in texts],
            images=images_dict,
            annotations=annotations_dict).as_yolo(
                annotations_directory_path=annotations_directory,
                min_image_area_percentage=min_image_area_percentage,
                max_image_area_percentage=max_image_area_percentage,
                approximation_percentage=approximation_percentage)

    if show:
        cv2.imshow('Image', image)  # Provide window name
        k = cv2.waitKey(0)
        if k == 27:
            # wait for ESC key to exit
            cv2.destroyAllWindows()


if __name__ == '__main__':
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    cfg.work_dir = osp.join('./work_dirs',
                            osp.splitext(osp.basename(args.config))[0])
    # init model
    cfg.load_from = args.checkpoint
    model = init_detector(cfg, checkpoint=args.checkpoint, device=args.device)

    # init test pipeline
    test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg)
    test_pipeline = Compose(test_pipeline_cfg)

    # Prompts: one per line of a .txt file, or a comma-separated string.
    if args.text.endswith('.txt'):
        with open(args.text) as f:
            lines = f.readlines()
        texts = [[t.rstrip('\r\n')] for t in lines] + [[' ']]
    else:
        texts = [[t.strip()] for t in args.text.split(',')] + [[' ']]

    output_dir = args.output_dir
    # makedirs also handles nested paths and an already existing directory.
    os.makedirs(output_dir, exist_ok=True)

    # load images
    if not osp.isfile(args.image):
        images = [
            osp.join(args.image, img) for img in os.listdir(args.image)
            if img.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]
    else:
        images = [args.image]

    # reparameterize texts
    model.reparameterize(texts)
    progress_bar = ProgressBar(len(images))
    for image_path in images:
        inference_detector(model,
                           image_path,
                           texts,
                           test_pipeline,
                           args.topk,
                           args.threshold,
                           output_dir=output_dir,
                           use_amp=args.amp,
                           show=args.show,
                           annotation=args.annotation)
        progress_bar.update()
a/models/YOLO-World/demo/inference.ipynb b/models/YOLO-World/demo/inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..19cc7b1480cc0b0e732762cc166cf52b568f17d2 --- /dev/null +++ b/models/YOLO-World/demo/inference.ipynb @@ -0,0 +1,2836 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "PorcLK9OylD6" + }, + "source": [ + " ![yolo_logo.png]()\n", + "\n", + "\n", + " This YOLO-World notebook is a Inferencing notebook presenting Real-Time Open-Vocabulary Object Detection.\n", + "\n", + "We hope that the resources in this notebook will help you for inferencing." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zw1OxP87zjCM" + }, + "source": [ + "## Setup\n", + "\n", + "Clone GitHub [repository](https://github.com/AILab-CVC/YOLO-World) and install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rlsGVhscqjY0", + "outputId": "382bd549-11ee-4e1b-ec00-5e1401911bf4" + }, + "outputs": [], + "source": [ + "!git clone --recursive https://github.com/AILab-CVC/YOLO-World\n", + "%cd YOLO-World/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uE1GmCSAJHXC", + "outputId": "43654a80-834a-4d34-caa6-00ae9a030f2e" + }, + "outputs": [], + "source": [ + "import os\n", + "# Install certain version of requests, tqdm, rich for openxlab (fix for yolo_world)\n", + "# Install mmcv before avoding compiling of mmcv and shortining waiting time installs \"whl\" file\n", + "# Downgrade pytorch version for fast installing mmcv (your on prem should finish faster with latest pytorch)\n", + "\n", + "\n", + "if 'COLAB_GPU' in os.environ:\n", + " !pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu121 -q\n", + " !pip install requests==2.28.2 tqdm==4.65.0 rich==13.4.2 
-q\n", + " %pip install -U openmim -q\n", + " !mim install \"mmengine>=0.7.0\" -q\n", + " !mim install \"mmcv\" -q\n", + "else:\n", + " !pip install torch wheel requests==2.28.2 tqdm==4.65.0 rich==13.4.2 -q\n", + "\n", + "!pip install -e . -vv -q" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "v_Pgd1urgbj8" + }, + "outputs": [], + "source": [ + "if 'COLAB_GPU' in os.environ:\n", + " # Restart colab session (required for yolo_world to work in google colab)\n", + " quit()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZWq1gYXD2c4n" + }, + "source": [ + "## Pretrained Models\n", + "\n", + "Download Pretrained weights from Huggingface and set configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LGuy6naerg4e", + "outputId": "c57e8147-c06c-4782-f5bf-6aa3e8ddeb58" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "%cd YOLO-World/\n", + "if not os.path.exists(\"pretrained_weights\"):\n", + " os.makedirs(\"pretrained_weights\")\n", + "\n", + "# Download pretrained weights of YOLO-Worldv2-L\tO365+GoldG img_size=1280 model\n", + "!wget -P pretrained_weights/ https://huggingface.co/wondervictor/YOLO-World/resolve/main/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth\n", + "!wget https://media.roboflow.com/notebooks/examples/dog.jpeg" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YECjGYE7-Ojg" + }, + "source": [ + "## Loading model configurations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "0bc8d02b9b0941f8b38f822b8552e54c", + "c5081cf89abc4514b81b0a705850b26f", + "93a7172913a84728a2919fe8796567c0", + "c50ae95e956d456395d05f12367ff8e3", + "c80456ab37c844b1beb074e74b17d8fb", + "4e47a4bc196e44dba1d7ce4faa5b74af", + 
"d0bad9ce27a742a49667d1cd58eea350", + "32f222c92f844a8ea780960c0e25a64c", + "06c1c81b5e8544d8aaca394f2e13539e", + "81df29145f4449339e75f78919147899", + "614d44b9730b4fe9a01305ac6c822388", + "1745520fa3834cbf900b1646fec5d6aa", + "768b536c12f84b1cb24d38675573baa2", + "569e8aabbcd74e4f9288bdebeb91400b", + "ad5431bc98784ee7adcf489989aba432", + "8614da2bade94ade978fe71994c777fa", + "6113de583b7a4a22bbbbfcf9a0ae6ea7", + "164ffff1e1944183b01d8cf76541556a", + "cd8f2fffa9a845cfbc2ce664647acda5", + "32b452668efa4b61acacd04d289edde0", + "e46b4e1e95da4d6f924a851265403480", + "ee06192a75fc403ba6d945da2efe4317", + "828a59ea87f34d4f8be9fa6fb63fe991", + "0becbcf3af914252b73937ffd789c533", + "8dc08812835f40e9a85c73ea57710029", + "bd6743fab19a4056a741fb923f1d66c6", + "cfc1570a53d4467397583e5614f35515", + "52d5fe0cd2514f87917ab8bcf923becf", + "0cee1b12a94c4fdaa97d7b0e57a9d8f6", + "ef7a3e2a70624fdfa2d590635e962ffd", + "794250f1a0b44831864f487cfe4be7b3", + "4b48981f033a4e0b89b3dc1cd088599e", + "46da2b5501cf471a99f354f17e85fc1d", + "084791b432c64ea383eeb10dd912d27f", + "c7e34cc6b3b54c36933cf4b21f32b469", + "961b3186964b4aa694ed50e601ca6ea6", + "9c7aebef36c94f659420f35c6951ac14", + "0381e7fdec3642d7af08a11841aaaba4", + "b69eb52454c64fb4bac7c9f008241d24", + "5dfaba276a3c480d837a75767300e96f", + "309c33ce179144ac9b23d6396f2fdcd6", + "dc6812fd13504f6bae35d81aaf2593fa", + "f7463653c82e41b087e794191e70c43e", + "7c53e4cff8344da8858060970b931a80", + "07cb92c22899453291baccd1f9b11a49", + "cbc909708fca4191a80767479a9c9c55", + "152972aaf5c7433da0a7ce4889694cf4", + "b769fadb878c43beaec040a779ba9067", + "483f26b6d2e54bb581e8a6392b8e1b39", + "b2dd4e48fb974451979e37fb99bbdf5b", + "53a11753fc664f12942c0a5a8f62e695", + "e908586e492443c6a28ed16750df6748", + "013ebfb59e88443d978bb2a4f3a68f96", + "265d430fcc604c6984d70b7e63f11e37", + "f55df7a2f0474b5ab6d0a23bcedf8cc2", + "8a23897839594ba4827c5a34463dbb35", + "ce8d0eadfac444a6b88e0ba16ab6f3f9", + "2d181d3861c64d0c9d71331751de111e", + 
"fd9cc05ff50e4463b004cacd050b59c3", + "dedf6f98735643d5bb53ff2e874137c7", + "5dbdd01ad0bd4939937fa32eb32182a1", + "fd7d351c2a5943cd9934b36be67481ca", + "f9ecf05660fa4512b4ff4cbb9d30f3e1", + "898c2d408c0a4b34851f7fbf537f45b1", + "d5797b57dcf04274a5f7077d104a62b6", + "ec8e16b5e78d4c55b100090ee7e23ddc", + "14b64b065ef740cbbff5587f062b04a3", + "5ede178010f54c259c9802698a599664", + "225ca87fffb54bfa9514513ace1fdbf1", + "cd906068e1cb46e4b5b62fc6267e8e6d", + "0aafe16d6e6d4561932cba3bed69f562", + "a81ab5c22fdc4ea99ebe396d3b43c552", + "8841ee0d44fe4073b3dc5237c8045185", + "d839228be8b84096a587489217630b7f", + "b76961c341d64959ae6ed7ad40f6abab", + "df073637968a4ca499a861f74869d45d", + "2f5098940d27496983565ddb3ab158bd" + ] + }, + "id": "tFQXnK-FsXlj", + "outputId": "6e6286aa-fbf8-44b1-94f6-2ccc661d040e" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import torch\n", + "from mmengine.config import Config\n", + "from mmengine.dataset import Compose\n", + "from mmengine.runner import Runner\n", + "from mmengine.runner.amp import autocast\n", + "from mmyolo.registry import RUNNERS\n", + "from torchvision.ops import nms\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " # load config\n", + " cfg = Config.fromfile(\n", + " \"configs/pretrain/yolo_world_v2_l_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py\"\n", + " )\n", + " cfg.work_dir = \".\"\n", + " cfg.load_from = \"pretrained_weights/yolo_world_v2_l_obj365v1_goldg_pretrain_1280ft-9babe3f6.pth\"\n", + " runner = Runner.from_cfg(cfg)\n", + " runner.call_hook(\"before_run\")\n", + " runner.load_or_resume()\n", + " pipeline = cfg.test_dataloader.dataset.pipeline\n", + " runner.pipeline = Compose(pipeline)\n", + "\n", + " # run model evaluation\n", + " runner.model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7syIir2qHoc9" + }, + "outputs": [], + "source": [ + "def colorstr(*input):\n", + " \"\"\"\n", + " Helper function for style 
logging\n", + " \"\"\"\n", + " *args, string = input if len(input) > 1 else (\"bold\", input[0])\n", + " colors = {\"bold\": \"\\033[1m\"}\n", + "\n", + " return \"\".join(colors[x] for x in args) + f\"{string}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NI1DSw4SCCUU" + }, + "source": [ + "# Run Image Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "ozklQl6BnsLI" + }, + "outputs": [], + "source": [ + "import PIL.Image\n", + "import cv2\n", + "import supervision as sv\n", + "\n", + "bounding_box_annotator = sv.BoxAnnotator()\n", + "label_annotator = sv.LabelAnnotator(text_position=sv.Position.CENTER)\n", + "mask_annotator = sv.MaskAnnotator()\n", + "\n", + "class_names = (\"person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, \"\n", + " \"traffic light, fire hydrant, stop sign, parking meter, bench, bird, \"\n", + " \"cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, \"\n", + " \"backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, \"\n", + " \"sports ball, kite, baseball bat, baseball glove, skateboard, \"\n", + " \"surfboard, tennis racket, bottle, wine glass, cup, fork, knife, \"\n", + " \"spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, \"\n", + " \"hot dog, pizza, donut, cake, chair, couch, potted plant, bed, \"\n", + " \"dining table, toilet, tv, laptop, mouse, remote, keyboard, \"\n", + " \"cell phone, microwave, oven, toaster, sink, refrigerator, book, \"\n", + " \"clock, vase, scissors, teddy bear, hair drier, toothbrush\")\n", + "\n", + "class_names2 = (\"dog, eye, tongue, ear, leash\")\n", + "\n", + "\n", + "def run_image(\n", + " runner,\n", + " input_image,\n", + " max_num_boxes=100,\n", + " score_thr=0.05,\n", + " nms_thr=0.5,\n", + " output_image=\"output.png\",\n", + "):\n", + " output_image = \"runs/detect/\"+output_image\n", + " texts = [[t.strip()] for t in class_names.split(\",\")] + [[\" \"]]\n", + " data_info = 
runner.pipeline(dict(img_id=0, img_path=input_image,\n", + " texts=texts))\n", + "\n", + " data_batch = dict(\n", + " inputs=data_info[\"inputs\"].unsqueeze(0),\n", + " data_samples=[data_info[\"data_samples\"]],\n", + " )\n", + "\n", + " with autocast(enabled=False), torch.no_grad():\n", + " output = runner.model.test_step(data_batch)[0]\n", + " runner.model.class_names = texts\n", + " pred_instances = output.pred_instances\n", + "\n", + " # nms\n", + " keep_idxs = nms(pred_instances.bboxes, pred_instances.scores, iou_threshold=nms_thr)\n", + " pred_instances = pred_instances[keep_idxs]\n", + " pred_instances = pred_instances[pred_instances.scores.float() > score_thr]\n", + "\n", + " if len(pred_instances.scores) > max_num_boxes:\n", + " indices = pred_instances.scores.float().topk(max_num_boxes)[1]\n", + " pred_instances = pred_instances[indices]\n", + " output.pred_instances = pred_instances\n", + "\n", + " # predictions\n", + " pred_instances = pred_instances.cpu().numpy()\n", + "\n", + " if 'masks' in pred_instances:\n", + " masks = pred_instances['masks']\n", + " else:\n", + " masks = None\n", + " \n", + " detections = sv.Detections(\n", + " xyxy=pred_instances['bboxes'],\n", + " class_id=pred_instances['labels'],\n", + " confidence=pred_instances['scores']\n", + " )\n", + "\n", + " # label ids with confidence scores\n", + " labels = [\n", + " f\"{class_id} {confidence:0.2f}\"\n", + " for class_id, confidence\n", + " in zip(detections.class_id, detections.confidence)\n", + " ]\n", + "\n", + " # draw bounding box with label\n", + " image = PIL.Image.open(input_image)\n", + " svimage = np.array(image)\n", + " svimage = bounding_box_annotator.annotate(svimage, detections)\n", + " svimage = label_annotator.annotate(svimage, detections, labels)\n", + " if masks is not None:\n", + " svimage = mask_annotator.annotate(image, detections)\n", + "\n", + " # save output image\n", + " cv2.imwrite(output_image, svimage[:, :, ::-1])\n", + " print(f\"Results saved to 
{colorstr('bold', output_image)}\")\n", + "\n", + " return svimage[:, :, ::-1]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 979 + }, + "id": "-BL_keU8moAM", + "outputId": "78fe2957-1980-49b7-a64d-6a5d9f62cacf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results saved to \u001b[1mruns/detect/output.png\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "img = run_image(runner,\"dog.jpeg\")\n", + "sv.plot_image(img)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "013ebfb59e88443d978bb2a4f3a68f96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0381e7fdec3642d7af08a11841aaaba4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + 
"height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "06c1c81b5e8544d8aaca394f2e13539e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "07cb92c22899453291baccd1f9b11a49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_cbc909708fca4191a80767479a9c9c55", + "IPY_MODEL_152972aaf5c7433da0a7ce4889694cf4", + "IPY_MODEL_b769fadb878c43beaec040a779ba9067" + ], + "layout": "IPY_MODEL_483f26b6d2e54bb581e8a6392b8e1b39" + } + }, + "084791b432c64ea383eeb10dd912d27f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c7e34cc6b3b54c36933cf4b21f32b469", + "IPY_MODEL_961b3186964b4aa694ed50e601ca6ea6", + "IPY_MODEL_9c7aebef36c94f659420f35c6951ac14" + ], + "layout": "IPY_MODEL_0381e7fdec3642d7af08a11841aaaba4" + } + }, + "0aafe16d6e6d4561932cba3bed69f562": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0bc8d02b9b0941f8b38f822b8552e54c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_c5081cf89abc4514b81b0a705850b26f", + "IPY_MODEL_93a7172913a84728a2919fe8796567c0", + "IPY_MODEL_c50ae95e956d456395d05f12367ff8e3" + ], + "layout": "IPY_MODEL_c80456ab37c844b1beb074e74b17d8fb" + } + }, + "0becbcf3af914252b73937ffd789c533": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52d5fe0cd2514f87917ab8bcf923becf", + "placeholder": "​", + "style": "IPY_MODEL_0cee1b12a94c4fdaa97d7b0e57a9d8f6", + "value": "vocab.json: 100%" + } + }, + "0cee1b12a94c4fdaa97d7b0e57a9d8f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "14b64b065ef740cbbff5587f062b04a3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5ede178010f54c259c9802698a599664", + "IPY_MODEL_225ca87fffb54bfa9514513ace1fdbf1", + "IPY_MODEL_cd906068e1cb46e4b5b62fc6267e8e6d" + ], + "layout": 
"IPY_MODEL_0aafe16d6e6d4561932cba3bed69f562" + } + }, + "152972aaf5c7433da0a7ce4889694cf4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e908586e492443c6a28ed16750df6748", + "max": 2224041, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_013ebfb59e88443d978bb2a4f3a68f96", + "value": 2224041 + } + }, + "164ffff1e1944183b01d8cf76541556a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1745520fa3834cbf900b1646fec5d6aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_768b536c12f84b1cb24d38675573baa2", + "IPY_MODEL_569e8aabbcd74e4f9288bdebeb91400b", + "IPY_MODEL_ad5431bc98784ee7adcf489989aba432" + ], + "layout": "IPY_MODEL_8614da2bade94ade978fe71994c777fa" + } + }, + "225ca87fffb54bfa9514513ace1fdbf1": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d839228be8b84096a587489217630b7f", + "max": 605247071, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b76961c341d64959ae6ed7ad40f6abab", + "value": 605247071 + } + }, + "265d430fcc604c6984d70b7e63f11e37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d181d3861c64d0c9d71331751de111e": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f9ecf05660fa4512b4ff4cbb9d30f3e1", + "max": 389, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_898c2d408c0a4b34851f7fbf537f45b1", + "value": 389 + } + }, + "2f5098940d27496983565ddb3ab158bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "309c33ce179144ac9b23d6396f2fdcd6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "32b452668efa4b61acacd04d289edde0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "32f222c92f844a8ea780960c0e25a64c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "46da2b5501cf471a99f354f17e85fc1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "483f26b6d2e54bb581e8a6392b8e1b39": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b48981f033a4e0b89b3dc1cd088599e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { 
+ "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e47a4bc196e44dba1d7ce4faa5b74af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52d5fe0cd2514f87917ab8bcf923becf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53a11753fc664f12942c0a5a8f62e695": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "description_width": "" + } + }, + "569e8aabbcd74e4f9288bdebeb91400b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cd8f2fffa9a845cfbc2ce664647acda5", + "max": 4186, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_32b452668efa4b61acacd04d289edde0", + "value": 4186 + } + }, + "5dbdd01ad0bd4939937fa32eb32182a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "5dfaba276a3c480d837a75767300e96f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5ede178010f54c259c9802698a599664": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a81ab5c22fdc4ea99ebe396d3b43c552", + "placeholder": "​", + "style": "IPY_MODEL_8841ee0d44fe4073b3dc5237c8045185", + "value": "pytorch_model.bin: 100%" + } + }, + "6113de583b7a4a22bbbbfcf9a0ae6ea7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "614d44b9730b4fe9a01305ac6c822388": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "768b536c12f84b1cb24d38675573baa2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6113de583b7a4a22bbbbfcf9a0ae6ea7", + "placeholder": "​", + "style": "IPY_MODEL_164ffff1e1944183b01d8cf76541556a", + "value": "config.json: 100%" + } + }, + "794250f1a0b44831864f487cfe4be7b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "7c53e4cff8344da8858060970b931a80": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "81df29145f4449339e75f78919147899": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "828a59ea87f34d4f8be9fa6fb63fe991": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0becbcf3af914252b73937ffd789c533", + "IPY_MODEL_8dc08812835f40e9a85c73ea57710029", + "IPY_MODEL_bd6743fab19a4056a741fb923f1d66c6" + ], + "layout": "IPY_MODEL_cfc1570a53d4467397583e5614f35515" + } + }, + "8614da2bade94ade978fe71994c777fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8841ee0d44fe4073b3dc5237c8045185": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "898c2d408c0a4b34851f7fbf537f45b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8a23897839594ba4827c5a34463dbb35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ce8d0eadfac444a6b88e0ba16ab6f3f9", + "IPY_MODEL_2d181d3861c64d0c9d71331751de111e", + "IPY_MODEL_fd9cc05ff50e4463b004cacd050b59c3" + ], + "layout": "IPY_MODEL_dedf6f98735643d5bb53ff2e874137c7" + } + }, + "8dc08812835f40e9a85c73ea57710029": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef7a3e2a70624fdfa2d590635e962ffd", + "max": 862328, + "min": 0, + "orientation": "horizontal", + "style": 
"IPY_MODEL_794250f1a0b44831864f487cfe4be7b3", + "value": 862328 + } + }, + "93a7172913a84728a2919fe8796567c0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_32f222c92f844a8ea780960c0e25a64c", + "max": 568, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_06c1c81b5e8544d8aaca394f2e13539e", + "value": 568 + } + }, + "961b3186964b4aa694ed50e601ca6ea6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_309c33ce179144ac9b23d6396f2fdcd6", + "max": 524657, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_dc6812fd13504f6bae35d81aaf2593fa", + "value": 524657 + } + }, + "9c7aebef36c94f659420f35c6951ac14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_f7463653c82e41b087e794191e70c43e", + "placeholder": "​", + "style": "IPY_MODEL_7c53e4cff8344da8858060970b931a80", + "value": " 525k/525k [00:00<00:00, 28.1MB/s]" + } + }, + "a81ab5c22fdc4ea99ebe396d3b43c552": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ad5431bc98784ee7adcf489989aba432": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_e46b4e1e95da4d6f924a851265403480", + "placeholder": "​", + "style": "IPY_MODEL_ee06192a75fc403ba6d945da2efe4317", + "value": " 4.19k/4.19k [00:00<00:00, 161kB/s]" + } + }, + "b2dd4e48fb974451979e37fb99bbdf5b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b69eb52454c64fb4bac7c9f008241d24": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b76961c341d64959ae6ed7ad40f6abab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b769fadb878c43beaec040a779ba9067": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_265d430fcc604c6984d70b7e63f11e37", + "placeholder": "​", + "style": "IPY_MODEL_f55df7a2f0474b5ab6d0a23bcedf8cc2", + "value": " 2.22M/2.22M [00:00<00:00, 8.62MB/s]" + } + }, + "bd6743fab19a4056a741fb923f1d66c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": 
{ + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b48981f033a4e0b89b3dc1cd088599e", + "placeholder": "​", + "style": "IPY_MODEL_46da2b5501cf471a99f354f17e85fc1d", + "value": " 862k/862k [00:00<00:00, 1.24MB/s]" + } + }, + "c5081cf89abc4514b81b0a705850b26f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4e47a4bc196e44dba1d7ce4faa5b74af", + "placeholder": "​", + "style": "IPY_MODEL_d0bad9ce27a742a49667d1cd58eea350", + "value": "tokenizer_config.json: 100%" + } + }, + "c50ae95e956d456395d05f12367ff8e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_81df29145f4449339e75f78919147899", + "placeholder": "​", + "style": "IPY_MODEL_614d44b9730b4fe9a01305ac6c822388", + "value": " 568/568 [00:00<00:00, 24.3kB/s]" + } + }, + "c7e34cc6b3b54c36933cf4b21f32b469": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b69eb52454c64fb4bac7c9f008241d24", + "placeholder": "​", + "style": "IPY_MODEL_5dfaba276a3c480d837a75767300e96f", + "value": "merges.txt: 100%" + } + }, + "c80456ab37c844b1beb074e74b17d8fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cbc909708fca4191a80767479a9c9c55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b2dd4e48fb974451979e37fb99bbdf5b", + "placeholder": "​", + "style": "IPY_MODEL_53a11753fc664f12942c0a5a8f62e695", + "value": "tokenizer.json: 100%" + } + }, + "cd8f2fffa9a845cfbc2ce664647acda5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd906068e1cb46e4b5b62fc6267e8e6d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df073637968a4ca499a861f74869d45d", + "placeholder": "​", + "style": "IPY_MODEL_2f5098940d27496983565ddb3ab158bd", + "value": " 605M/605M [00:02<00:00, 182MB/s]" + } + }, + "ce8d0eadfac444a6b88e0ba16ab6f3f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5dbdd01ad0bd4939937fa32eb32182a1", + "placeholder": "​", + "style": "IPY_MODEL_fd7d351c2a5943cd9934b36be67481ca", + "value": "special_tokens_map.json: 100%" + } + }, + "cfc1570a53d4467397583e5614f35515": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0bad9ce27a742a49667d1cd58eea350": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d5797b57dcf04274a5f7077d104a62b6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "d839228be8b84096a587489217630b7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dc6812fd13504f6bae35d81aaf2593fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dedf6f98735643d5bb53ff2e874137c7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "df073637968a4ca499a861f74869d45d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e46b4e1e95da4d6f924a851265403480": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e908586e492443c6a28ed16750df6748": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": 
null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec8e16b5e78d4c55b100090ee7e23ddc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee06192a75fc403ba6d945da2efe4317": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ef7a3e2a70624fdfa2d590635e962ffd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f55df7a2f0474b5ab6d0a23bcedf8cc2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f7463653c82e41b087e794191e70c43e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f9ecf05660fa4512b4ff4cbb9d30f3e1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd7d351c2a5943cd9934b36be67481ca": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fd9cc05ff50e4463b004cacd050b59c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d5797b57dcf04274a5f7077d104a62b6", + "placeholder": "​", + "style": "IPY_MODEL_ec8e16b5e78d4c55b100090ee7e23ddc", + "value": " 389/389 [00:00<00:00, 31.4kB/s]" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/models/YOLO-World/demo/simple_demo.py b/models/YOLO-World/demo/simple_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..fb797835db5be63d50cc5e213662d0039ae73cc4 --- /dev/null +++ b/models/YOLO-World/demo/simple_demo.py @@ -0,0 +1,61 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+import os.path as osp + +import cv2 +import torch +from mmengine.config import Config +from mmengine.dataset import Compose +from mmdet.apis import init_detector +from mmdet.utils import get_test_pipeline_cfg + + +def inference(model, image, texts, test_pipeline, score_thr=0.3, max_dets=100): + image = cv2.imread(image) + image = image[:, :, [2, 1, 0]] + data_info = dict(img=image, img_id=0, texts=texts) + data_info = test_pipeline(data_info) + data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), + data_samples=[data_info['data_samples']]) + with torch.no_grad(): + output = model.test_step(data_batch)[0] + pred_instances = output.pred_instances + # score thresholding + pred_instances = pred_instances[pred_instances.scores.float() > score_thr] + # max detections + if len(pred_instances.scores) > max_dets: + indices = pred_instances.scores.float().topk(max_dets)[1] + pred_instances = pred_instances[indices] + + pred_instances = pred_instances.cpu().numpy() + boxes = pred_instances['bboxes'] + labels = pred_instances['labels'] + scores = pred_instances['scores'] + label_texts = [texts[x][0] for x in labels] + return boxes, labels, label_texts, scores + + +if __name__ == "__main__": + + config_file = "configs/pretrain/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_1280ft_lvis_minival.py" + checkpoint = "weights/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" + + cfg = Config.fromfile(config_file) + cfg.work_dir = osp.join('./work_dirs') + # init model + cfg.load_from = checkpoint + model = init_detector(cfg, checkpoint=checkpoint, device='cuda:0') + test_pipeline_cfg = get_test_pipeline_cfg(cfg=cfg) + test_pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' + test_pipeline = Compose(test_pipeline_cfg) + + texts = [['person'], ['bus'], [' ']] + image = "demo/sample_images/bus.jpg" + print(f"starting to detect: {image}") + results = inference(model, image, texts, test_pipeline) + format_str = [ + f"obj-{idx}: {box}, 
label-{lbl}, class-{lbl_text}, score-{score}" + for idx, (box, lbl, lbl_text, score) in enumerate(zip(*results)) + ] + print("detecting results:") + for q in format_str: + print(q) diff --git a/models/YOLO-World/demo/video_demo.py b/models/YOLO-World/demo/video_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6b13ec915429cec699868ffa189d1d9647195b --- /dev/null +++ b/models/YOLO-World/demo/video_demo.py @@ -0,0 +1,108 @@ +# Copyright (c) Tencent Inc. All rights reserved. +# This file is modified from mmyolo/demo/video_demo.py +import argparse + +import cv2 +import mmcv +import torch +from mmengine.dataset import Compose +from mmdet.apis import init_detector +from mmengine.utils import track_iter_progress + +from mmyolo.registry import VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='YOLO-World video demo') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('video', help='video file path') + parser.add_argument( + 'text', + help= + 'text prompts, including categories separated by a comma or a txt file with each line as a prompt.' 
+ ) + parser.add_argument('--device', + default='cuda:0', + help='device used for inference') + parser.add_argument('--score-thr', + default=0.1, + type=float, + help='confidence score threshold for predictions.') + parser.add_argument('--out', type=str, help='output video file') + args = parser.parse_args() + return args + + +def inference_detector(model, image, texts, test_pipeline, score_thr=0.3): + data_info = dict(img_id=0, img=image, texts=texts) + data_info = test_pipeline(data_info) + data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), + data_samples=[data_info['data_samples']]) + + with torch.no_grad(): + output = model.test_step(data_batch)[0] + pred_instances = output.pred_instances + pred_instances = pred_instances[pred_instances.scores.float() > + score_thr] + output.pred_instances = pred_instances + return output + + +def main(): + args = parse_args() + + model = init_detector(args.config, args.checkpoint, device=args.device) + + # build test pipeline + model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + + if args.text.endswith('.txt'): + with open(args.text) as f: + lines = f.readlines() + texts = [[t.rstrip('\r\n')] for t in lines] + [[' ']] + else: + texts = [[t.strip()] for t in args.text.split(',')] + [[' ']] + + # reparameterize texts + model.reparameterize(texts) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + video_reader = mmcv.VideoReader(args.video) + video_writer = None + if args.out: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter( + args.out, fourcc, video_reader.fps, + (video_reader.width, video_reader.height)) + + for frame in track_iter_progress(video_reader): + result = inference_detector(model, + frame, + texts, + 
test_pipeline, + score_thr=args.score_thr) + visualizer.add_datasample(name='video', + image=frame, + data_sample=result, + draw_gt=False, + show=False, + pred_score_thr=args.score_thr) + frame = visualizer.get_image() + + if args.out: + video_writer.write(frame) + + if video_writer: + video_writer.release() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/deploy/__init__.py b/models/YOLO-World/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/YOLO-World/deploy/easydeploy/README.md b/models/YOLO-World/deploy/easydeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1816e7ed96ee34209c56af4a22eda5f1eb7e499b --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/README.md @@ -0,0 +1,11 @@ +# MMYOLO Model Easy-Deployment + +## Introduction + +This project is developed for easily converting your MMYOLO models to other inference backends without the need for MMDeploy, which reduces the cost of both time and effort on getting familiar with MMDeploy. + +Currently we support converting to `ONNX` and `TensorRT` formats; other inference backends such as `ncnn` will be added to this project as well. 
+ +## Supported Backends + +- [Model Convert](docs/model_convert.md) diff --git a/models/YOLO-World/deploy/easydeploy/README_zh-CN.md b/models/YOLO-World/deploy/easydeploy/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..4c6bc0cf4ef91edeced04bdf15af08ae1f6f0dcd --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/README_zh-CN.md @@ -0,0 +1,11 @@ +# MMYOLO 模型转换 + +## 介绍 + +本项目作为 MMYOLO 的部署 project 单独存在,意图剥离 MMDeploy 当前的体系,独自支持用户完成模型训练后的转换和部署功能,使用户的学习和工程成本下降。 + +当前支持对 ONNX 格式和 TensorRT 格式的转换,后续对其他推理平台也会支持起来。 + +## 转换教程 + +- [Model Convert](docs/model_convert.md) diff --git a/models/YOLO-World/deploy/easydeploy/backbone/__init__.py b/models/YOLO-World/deploy/easydeploy/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc167f8515c66a30d884ed9655a11d45e21481c0 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/backbone/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .common import DeployC2f +from .focus import DeployFocus, GConvFocus, NcnnFocus + +__all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f'] diff --git a/models/YOLO-World/deploy/easydeploy/backbone/common.py b/models/YOLO-World/deploy/easydeploy/backbone/common.py new file mode 100644 index 0000000000000000000000000000000000000000..617875bd979a5b9150e476544090777118087a0b --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/backbone/common.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn +from torch import Tensor + + +class DeployC2f(nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + x_main = self.main_conv(x) + x_main = [x_main, x_main[:, self.mid_channels:, ...]] + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.pop(1) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/models/YOLO-World/deploy/easydeploy/backbone/focus.py 
b/models/YOLO-World/deploy/easydeploy/backbone/focus.py new file mode 100644 index 0000000000000000000000000000000000000000..2a19afcca1d9c4e27109daeebd83907cd9b7b284 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/backbone/focus.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class DeployFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, channel, height, width = x.shape + x = x.reshape(batch_size, channel, -1, 2, width) + x = x.reshape(batch_size, channel, x.shape[2], 2, -1, 2) + half_h = x.shape[2] + half_w = x.shape[4] + x = x.permute(0, 5, 3, 1, 2, 4) + x = x.reshape(batch_size, channel * 4, half_h, half_w) + + return self.conv(x) + + +class NcnnFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, c, h, w = x.shape + assert h % 2 == 0 and w % 2 == 0, f'focus for yolox needs even feature\ + height and width, got {(h, w)}.' 
+ + x = x.reshape(batch_size, c * h, 1, w) + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * h * w, 1, 1) + + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * 4, h // 2, w // 2) + + return self.conv(x) + + +class GConvFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + device = next(orin_Focus.parameters()).device + self.weight1 = torch.tensor([[1., 0], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight2 = torch.tensor([[0, 0], [1., 0]]).expand(3, 1, 2, + 2).to(device) + self.weight3 = torch.tensor([[0, 1.], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight4 = torch.tensor([[0, 0], [0, 1.]]).expand(3, 1, 2, + 2).to(device) + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + conv1 = F.conv2d(x, self.weight1, stride=2, groups=3) + conv2 = F.conv2d(x, self.weight2, stride=2, groups=3) + conv3 = F.conv2d(x, self.weight3, stride=2, groups=3) + conv4 = F.conv2d(x, self.weight4, stride=2, groups=3) + return self.conv(torch.cat([conv1, conv2, conv3, conv4], dim=1)) diff --git a/models/YOLO-World/deploy/easydeploy/bbox_code/__init__.py b/models/YOLO-World/deploy/easydeploy/bbox_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b85a815536a5749a15f0ad6aab2b028eb6a3fe0a --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/bbox_code/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) + +__all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder'] diff --git a/models/YOLO-World/deploy/easydeploy/bbox_code/bbox_coder.py b/models/YOLO-World/deploy/easydeploy/bbox_code/bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..6483cf8b0328aff3d61f1fa0788337ab536d347d --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/bbox_code/bbox_coder.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from torch import Tensor + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Tensor) -> Tensor: + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + bbox_preds *= stride + tl_x = (priors[..., 0] - bbox_preds[..., 0]) + tl_y = (priors[..., 1] - bbox_preds[..., 1]) + br_x = (priors[..., 0] + bbox_preds[..., 2]) + br_y = (priors[..., 1] + bbox_preds[..., 3]) + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + +def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + xys = (bbox_preds[..., :2] * stride) + priors + whs = bbox_preds[..., 2:].exp() * stride + decoded_bboxes = torch.cat([xys, whs], -1) + 
return decoded_bboxes diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/CMakeLists.txt b/models/YOLO-World/deploy/easydeploy/deepstream/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f640bea13bacfc0f6cc2f33e598f65cf5ce0922e --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 2.8.12) + +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) +set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) + +project(nvdsparsebbox_mmyolo LANGUAGES CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_BUILD_TYPE Release) +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +# CUDA +find_package(CUDA REQUIRED) + +# TensorRT +set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") +set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") + +# DeepStream +set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") +set(DS_LIBRARIES ${DEEPSTREAM}/lib) +set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) + +include_directories( + ${CUDA_INCLUDE_DIRS} + ${TensorRT_INCLUDE_DIRS} + ${DS_INCLUDE_DIRS}) + +add_library( + ${PROJECT_NAME} + SHARED + custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) + +target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/README.md b/models/YOLO-World/deploy/easydeploy/deepstream/README.md new file mode 100644 index 0000000000000000000000000000000000000000..111f3765e41d558b64097d8a25585bd9c14acf4f --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/README.md @@ -0,0 +1,48 @@ +# Inference MMYOLO Models with DeepStream + +This project demonstrates how to inference MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). 
+ +## Pre-requisites + +### 1. Install Nvidia Driver and CUDA + +First, please follow the official documents and instructions to install the dedicated Nvidia graphics driver and the CUDA version matching your GPU and target Nvidia AIoT devices. + +### 2. Install DeepStream SDK + +Second, please follow the official instructions to download and install the DeepStream SDK. The current stable version of DeepStream is v6.2. + +### 3. Generate TensorRT Engine + +As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/). + +## Build and Run + +Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run with. + +```bash +mkdir build && cd build +cmake .. +make -j$(nproc) && make install +``` + +Then you can run the inference with this command. 
+ +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## Code Structure + +```bash +├── deepstream +│ ├── configs # config file for MMYOLO models +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats +│ │ └── nvdsparsebbox_mmyolo.cpp +│ ├── CMakeLists.txt +│ ├── coco_labels.txt # labels for coco detection +│ ├── deepstream_app_config.txt # DeepStream reference app configs for MMYOLO models +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/README_zh-CN.md b/models/YOLO-World/deploy/easydeploy/deepstream/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..13a85d5bc90159c3ff9f1a32e93d01e82ed2faa4 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/README_zh-CN.md @@ -0,0 +1,48 @@ +# 使用 DeepStream SDK 推理 MMYOLO 模型 + +本项目演示了如何使用 [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) 配合改写的 parser 来推理 MMYOLO 的模型。 + +## 预先准备 + +### 1. 安装 Nvidia 驱动和 CUDA + +首先请根据当前的显卡驱动和目标使用设备的驱动完成显卡驱动和 CUDA 的安装。 + +### 2. 安装 DeepStream SDK + +目前 DeepStream SDK 稳定版本已经更新到 v6.2,官方推荐使用这个版本。 + +### 3. 将 MMYOLO 模型转换为 TensorRT Engine + +推荐使用 EasyDeploy 中的 TensorRT 方案完成目标模型的转换部署,具体可参考 [此文档](../../easydeploy/docs/model_convert.md) 。 + +## 编译使用 + +当前项目使用的是 MMYOLO 的 rtmdet 模型,若想使用其他的模型,请参照目录下的配置文件进行改写。然后将转换完的 TensorRT engine 放在当前目录下并执行如下命令: + +```bash +mkdir build && cd build +cmake .. 
+make -j$(nproc) && make install +``` + +完成编译后可使用如下命令进行推理: + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## 项目代码结构 + +```bash +├── deepstream +│ ├── configs # MMYOLO 模型对应的 DeepStream 配置 +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # 适配 DeepStream formats 的 parser +│ │ └── nvdsparsebbox_mmyolo.cpp +| ├── CMakeLists.txt +│ ├── coco_labels.txt # coco labels +│ ├── deepstream_app_config.txt # DeepStream app 配置 +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/coco_labels.txt b/models/YOLO-World/deploy/easydeploy/deepstream/coco_labels.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/coco_labels.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_rtmdet.txt b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_rtmdet.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1e5efd2a3810730144e037ee96dfbd36124b0e6 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_rtmdet.txt @@ -0,0 +1,22 @@ +[property] +gpu-id=0 
+net-scale-factor=0.01735207357279195 +offsets=57.375;57.12;58.395 +model-color-format=1 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov5.txt b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov5.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov8.txt b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/configs/config_infer_yolov8.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 
+process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp b/models/YOLO-World/deploy/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb780856cbd2b289cdf9dc8518438f946a2ab548 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp @@ -0,0 +1,118 @@ +#include "nvdsinfer_custom_impl.h" +#include +#include + +/** + * Function expected by DeepStream for decoding the MMYOLO output. + * + * C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after + * adding all bounding boxes to the objectList vector. + * + * @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer. + * @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network. + * @param [detectionParams] NvDsInferParseDetectionParams with information about some config params. + * @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must + * be stored. + * + * @return true + */ + +// This is just the function prototype. The definition is written at the end of the file. +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList); + +static __inline__ float clamp(float& val, float min, float max) +{ + return val > min ? (val < max ? 
val : max) : min; +} + +static std::vector decodeMMYoloTensor( + const int* num_dets, + const float* bboxes, + const float* scores, + const int* labels, + const float& conf_thres, + const unsigned int& img_w, + const unsigned int& img_h +) +{ + std::vector bboxInfo; + size_t nums = num_dets[0]; + for (size_t i = 0; i < nums; i++) + { + float score = scores[i]; + if (score < conf_thres)continue; + float x0 = (bboxes[i * 4]); + float y0 = (bboxes[i * 4 + 1]); + float x1 = (bboxes[i * 4 + 2]); + float y1 = (bboxes[i * 4 + 3]); + x0 = clamp(x0, 0.f, img_w); + y0 = clamp(y0, 0.f, img_h); + x1 = clamp(x1, 0.f, img_w); + y1 = clamp(y1, 0.f, img_h); + NvDsInferParseObjectInfo obj; + obj.left = x0; + obj.top = y0; + obj.width = x1 - x0; + obj.height = y1 - y0; + obj.detectionConfidence = score; + obj.classId = labels[i]; + bboxInfo.push_back(obj); + } + + return bboxInfo; +} + +/* C-linkage to prevent name-mangling */ +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList) +{ + +// Some assertions and error checking. + if (outputLayersInfo.empty() || outputLayersInfo.size() != 4) + { + std::cerr << "Could not find output layer in bbox parsing" << std::endl; + return false; + } + +// Score threshold of bboxes. + const float conf_thres = detectionParams.perClassThreshold[0]; + +// Obtaining the output layer. 
+ const NvDsInferLayerInfo& num_dets = outputLayersInfo[0]; + const NvDsInferLayerInfo& bboxes = outputLayersInfo[1]; + const NvDsInferLayerInfo& scores = outputLayersInfo[2]; + const NvDsInferLayerInfo& labels = outputLayersInfo[3]; + +// num_dets(int) bboxes(float) scores(float) labels(int) + assert (num_dets.dims.numDims == 2); + assert (bboxes.dims.numDims == 3); + assert (scores.dims.numDims == 2); + assert (labels.dims.numDims == 2); + + +// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format. + std::vector objects = + decodeMMYoloTensor( + (const int*)(num_dets.buffer), + (const float*)(bboxes.buffer), + (const float*)(scores.buffer), + (const int*)(labels.buffer), + conf_thres, + networkInfo.width, + networkInfo.height + ); + + objectList.clear(); + objectList = objects; + return true; +} + +/* Check that the custom function has been defined correctly */ +CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO); diff --git a/models/YOLO-World/deploy/easydeploy/deepstream/deepstream_app_config.txt b/models/YOLO-World/deploy/easydeploy/deepstream/deepstream_app_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..331776897a5e9109b9007ed1b7974f128287c4fc --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/deepstream/deepstream_app_config.txt @@ -0,0 +1,62 @@ +[application] +enable-perf-measurement=1 +perf-measurement-interval-sec=5 + +[tiled-display] +enable=1 +rows=1 +columns=1 +width=1280 +height=720 +gpu-id=0 +nvbuf-memory-type=0 + +[source0] +enable=1 +type=3 +uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 +num-sources=1 +gpu-id=0 +cudadec-memtype=0 + +[sink0] +enable=1 +type=2 +sync=0 +gpu-id=0 +nvbuf-memory-type=0 + +[osd] +enable=1 +gpu-id=0 +border-width=5 +text-size=15 +text-color=1;1;1;1; +text-bg-color=0.3;0.3;0.3;1 +font=Serif +show-clock=0 +clock-x-offset=800 +clock-y-offset=820 +clock-text-size=12 +clock-color=1;0;0;0 +nvbuf-memory-type=0 + 
+[streammux] +gpu-id=0 +live-source=0 +batch-size=1 +batched-push-timeout=40000 +width=1920 +height=1080 +enable-padding=0 +nvbuf-memory-type=0 + +[primary-gie] +enable=1 +gpu-id=0 +gie-unique-id=1 +nvbuf-memory-type=0 +config-file=configs/config_infer_rtmdet.txt + +[tests] +file-loop=0 diff --git a/models/YOLO-World/deploy/easydeploy/docs/model_convert.md b/models/YOLO-World/deploy/easydeploy/docs/model_convert.md new file mode 100644 index 0000000000000000000000000000000000000000..9af62599dd1b56648680fc315ca88c35c7b31cb9 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/docs/model_convert.md @@ -0,0 +1,156 @@ +# MMYOLO 模型 ONNX 转换 + +## 1. 导出后端支持的 ONNX + +## 环境依赖 + +- [onnx](https://github.com/onnx/onnx) + + ```shell + pip install onnx + ``` + + [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (可选,用于简化模型) + + ```shell + pip install onnx-simplifier + ``` + +\*\*\* 请确保您在 `MMYOLO` 根目录下运行相关脚本,避免无法找到相关依赖包。\*\*\* + +## 使用方法 + +[模型导出脚本](./projects/easydeploy/tools/export_onnx.py)用于将 `MMYOLO` 模型转换为 `onnx` 。 + +### 参数介绍: + +- `config` : 构建模型使用的配置文件,如 [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) 。 +- `checkpoint` : 训练得到的权重文件,如 `yolov5s.pth` 。 +- `--work-dir` : 转换后的模型保存路径。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--batch-size`: 转换后的模型输入 `batch size` 。 +- `--device`: 转换模型使用的设备,默认为 `cuda:0`。 +- `--simplify`: 是否简化导出的 `onnx` 模型,需要安装 [onnx-simplifier](https://github.com/daquexian/onnx-simplifier),默认关闭。 +- `--opset`: 指定导出 `onnx` 的 `opset`,默认为 `11` 。 +- `--backend`: 指定导出 `onnx` 用于的后端名称,`ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`,默认为`onnxruntime`即 `ONNXRuntime`。 +- `--pre-topk`: 指定导出 `onnx` 的后处理筛选候选框个数阈值,默认为 `1000`。 +- `--keep-topk`: 指定导出 `onnx` 的非极大值抑制输出的候选框个数阈值,默认为 `100`。 +- `--iou-threshold`: 非极大值抑制中过滤重复候选框的 `iou` 阈值,默认为 `0.65`。 +- `--score-threshold`: 非极大值抑制中过滤候选框得分的阈值,默认为 `0.25`。 +- `--model-only`: 指定仅导出模型 backbone + neck, 不包含后处理,默认关闭。 + +例子: + 
+```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend 1 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +然后利用后端支持的工具如 `TensorRT` 读取 `onnx` 再次转换为后端支持的模型格式如 `.engine/.plan` 等。 + +`MMYOLO` 目前支持 `TensorRT8`, `TensorRT7`, `ONNXRuntime` 后端的端到端模型转换,目前仅支持静态 shape 模型的导出和转换,动态 batch 或动态长宽的模型端到端转换会在未来继续支持。 + +端到端转换得到的 `onnx` 模型输入输出如图: + +
+ +
+ +输入名: `images`, 尺寸 640x640 + +输出名: `num_dets`, 尺寸 1x1,表示检测目标数量。 + +输出名: `boxes`, 尺寸 1x100x4,表示检测框的坐标,格式为 `x1y1x2y2`。 + +输出名: `scores`, 尺寸 1x100,表示检测框的分数。 + +输出名: `labels`, 尺寸 1x100,表示检测框的类别 id。 + +可以利用 `num_dets` 中的个数对 `boxes`, `scores`, `labels` 进行截断,从 100 个检测结果中抽取前 `num_dets` 个目标作为最终检测结果。 + +## 2. 仅导出模型 Backbone + Neck + +当您需要部署在非 `TensorRT`, `ONNXRuntime` 等支持端到端部署的平台时,您可以考虑使用`--model-only` 参数并且不要传递 `--backend` 参数,您将会导出仅包含 `Backbone` + `neck` 的模型,模型的部分输出如图: + +
+ +
+ +这种导出方式获取的 `ONNX` 模型具有如下优点: + +- 算子简单,一般而言只包含 `Conv`,激活函数等简单算子,几乎不存在无法正确导出的情况,对于嵌入式部署更加友好。 +- 方便不同算法之间对比速度性能,由于不同的算法后处理不同,仅对比 `backbone` + `Neck` 的速度更加公平。 + +也有如下缺点: + +- 后处理逻辑需要单独完成,会有额外的 `decode` + `nms` 的操作需要实现。 +- 与 `TensorRT` 相比,由于 `TensorRT` 可以利用多核优势并行进行后处理,使用 `--model-only` 方式导出的模型性能会差很多。 + +### 使用方法 + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --model-only +``` + +## 使用 `model-only` 导出的 ONNX 进行推理 + +[模型推理脚本](./projects/easydeploy/examples/main_onnxruntime.py)用于推理导出的 `ONNX` 模型,需要安装基础依赖环境: + +[`onnxruntime`](https://github.com/microsoft/onnxruntime) 和 [`opencv-python`](https://github.com/opencv/opencv-python) + +```shell +pip install onnxruntime +pip install opencv-python==4.7.0.72 # 建议使用最新的 opencv +``` + +### 参数介绍: + +- `img` : 待检测的图片路径或图片文件夹路径。 +- `onnx` : 导出的 `model-only` ONNX 模型。 +- `--type` : 模型名称,目前支持 `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--out-dir`: 保存检测结果的路径 。 +- `--show`: 是否可视化检测结果。 +- `--score-thr`: 模型检测后处理的置信度分数 。 +- `--iou-thr`: 模型检测后处理的 IOU 分数 。 + +## 使用方法 + +```shell +cd ./projects/easydeploy/examples +python main_onnxruntime.py \ + "image_path_to_detect" \ + yolov5_s_model-only.onnx \ + --out-dir work_dir \ + --img-size 640 640 \ + --show \ + --score-thr 0.3 \ + --iou-thr 0.7 +``` + +*注意!!!* + +当您使用自定义数据集训练得到的模型时,请修改 [`config.py`](./projects/easydeploy/examples/config.py) 中 `CLASS_NAMES` 和 `CLASS_COLORS`,如果是 `yolov5` 或者 `yolov7` 基于 `anchor` 的模型请同时修改 `YOLOv5_ANCHORS` 和 `YOLOv7_ANCHORS`。 + +[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) 是目前所有算法仅使用 `numpy` 实现的 `decoder`,如果您对性能有较高的要求,可以参照相关代码改写为 `c/c++`。 diff --git a/models/YOLO-World/deploy/easydeploy/examples/config.py b/models/YOLO-World/deploy/easydeploy/examples/config.py new file mode 
100644 index 0000000000000000000000000000000000000000..4a85ff34273c22a356c9d6a3eaeb048b637b5f40 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/examples/config.py @@ -0,0 +1,64 @@ +from enum import Enum + + +class TASK_TYPE(Enum): + DET = 'det' + SEG = 'seg' + POSE = 'pose' + + +class ModelType(Enum): + YOLOV5 = 'yolov5' + YOLOX = 'yolox' + PPYOLOE = 'ppyoloe' + PPYOLOEP = 'ppyoloep' + YOLOV6 = 'yolov6' + YOLOV7 = 'yolov7' + RTMDET = 'rtmdet' + YOLOV8 = 'yolov8' + + +CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + +CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), + (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70), + (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0), + (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255), + (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255), + (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255), + (134, 134, 103), (145, 148, 174), (255, 
208, 186), + (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + +YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] + +YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)], + [(36, 75), (76, 55), (72, 146)], + [(142, 110), (192, 243), (459, 401)]] diff --git a/models/YOLO-World/deploy/easydeploy/examples/cv2_nms.py b/models/YOLO-World/deploy/easydeploy/examples/cv2_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..79e376356b75339c796aeeb280cd8cdb52db8518 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/examples/cv2_nms.py @@ -0,0 +1,36 @@ +from typing import List, Tuple, Union + +import cv2 +from numpy import ndarray + +MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) +assert MAJOR == 4 + + +def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], + scores: Union[List[float], Tuple[float]], + labels: Union[List[int], Tuple[int]], + conf_thres: float = 0.25, + iou_thres: float = 0.65) -> Tuple[List, List, List]: + if MINOR >= 7: + indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, + iou_thres) + elif MINOR == 6: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) + else: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, + iou_thres).flatten() + + nmsd_boxes = [] + nmsd_scores = [] + nmsd_labels = [] + for idx in 
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
                  '.tiff', '.webp')


def path_to_list(path: str):
    """Collect the absolute paths of the image files designated by *path*.

    Args:
        path: Path of a single image file, or of a directory whose direct
            children are scanned (no recursion).

    Returns:
        A list of absolute path strings. For a directory the list is
        sorted so that repeated runs process files in the same order.

    Raises:
        RuntimeError: If *path* is neither a supported image file nor a
            directory.
    """
    path = Path(path)
    # Compare the lower-cased suffix so that ``photo.JPG`` is accepted too.
    if path.is_file() and path.suffix.lower() in IMG_EXTENSIONS:
        return [str(path.absolute())]
    if path.is_dir():
        # ``is_file`` filters out sub-directories that merely look like
        # images by name; they cannot be read as images.
        return sorted(
            str(p.absolute()) for p in path.iterdir()
            if p.is_file() and p.suffix.lower() in IMG_EXTENSIONS)
    raise RuntimeError(
        f'{path} is neither a supported image file {IMG_EXTENSIONS} '
        'nor a directory')
def main():
    """Run a model-only ONNX detector on the input images and draw results.

    Each image is preprocessed, passed through the ONNX Runtime session,
    decoded, filtered with NMS and annotated. The annotated image is shown
    in a window when ``--show`` is set, otherwise it is written to
    ``--out-dir`` under the original file name.

    Raises:
        ValueError: If ``--type`` is missing or is not a known model type.
    """
    args = parse_args()
    if args.type is None:
        # ``--type`` is optional for argparse; fail with a clear message
        # instead of an AttributeError on ``None.lower()``.
        raise ValueError('--type is required, e.g. --type yolov5')
    out_dir = Path(args.out_dir)
    model_type = ModelType(args.type.lower())

    if not args.show:
        out_dir.mkdir(parents=True, exist_ok=True)

    files = path_to_list(args.img)
    session = onnxruntime.InferenceSession(
        args.onnx, providers=['CPUExecutionProvider'])
    preprocessor = Preprocess(model_type)
    decoder = Decoder(model_type, model_only=True)
    # Only the anchor-based heads need anchors; every other type gets None.
    if model_type == ModelType.YOLOV5:
        anchors = YOLOv5_ANCHORS
    elif model_type == ModelType.YOLOV7:
        anchors = YOLOv7_ANCHORS
    else:
        anchors = None

    for file in tqdm(files):
        image = cv2.imread(file)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            print(f'Skip unreadable image: {file}')
            continue
        image_h, image_w = image.shape[:2]
        img, (ratio_w, ratio_h) = preprocessor(image, args.img_size)
        features = session.run(None, {'images': img})
        decoder_outputs = decoder(
            features,
            args.score_thr,
            num_labels=len(CLASS_NAMES),
            anchors=anchors)
        nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression(
            *decoder_outputs, args.score_thr, args.iou_thr)
        for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels):
            x0, y0, x1, y1 = box
            # Undo the resize ratio and keep the corners inside the image.
            x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1))
            y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1))
            x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1))
            y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1))
            cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2)
            cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}',
                        (x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        (0, 255, 255), 2)
        if args.show:
            cv2.imshow('result', image)
            cv2.waitKey(0)
        else:
            cv2.imwrite(f'{out_dir / Path(file).name}', image)
def softmax(x: ndarray, axis: int = -1) -> ndarray:
    """Return the softmax of *x* along *axis*.

    The per-axis maximum is subtracted before exponentiation so the
    exponentials cannot overflow.
    """
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    y = e_x / e_x.sum(axis=axis, keepdims=True)
    return y


def sigmoid(x: ndarray) -> ndarray:
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    Only ``exp(-|x|)`` is evaluated, which lies in (0, 1], so large
    negative inputs no longer overflow and trigger a RuntimeWarning.
    """
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    return np.where(x >= 0, 1. / (1. + z), z / (1. + z))
def __yolov5_decode(self,
                    feats: List[ndarray],
                    conf_thres: float,
                    num_labels: int = 80,
                    **kwargs):
    """Decode anchor-based YOLOv5-style feature maps into proposals.

    Every kept proposal is appended to ``self.boxes_pro`` as a float32
    ``[x0, y0, w, h]`` array, with its score in ``self.scores_pro`` and its
    class index in ``self.labels_pro``.

    Args:
        feats: One channel-last feature map per level; level ``i`` is
            decoded with stride ``8 << i``.
        conf_thres: Threshold applied first to the objectness and then to
            the combined ``class score * objectness``.
        num_labels: Unused here; the class count is inferred from the
            channel dimension.
        **kwargs: ``anchors`` - per-level lists of ``(w, h)`` anchors.
            Missing or ``None`` selects the built-in YOLOv5 anchors.
    """
    anchors: Union[List, Tuple, None] = kwargs.get('anchors')
    if anchors is None:
        # An explicit ``anchors=None`` must behave like an omitted
        # argument instead of crashing on ``anchors[i]`` below.
        anchors = [[(10, 13), (16, 30), (33, 23)],
                   [(30, 61), (62, 45), (59, 119)],
                   [(116, 90), (156, 198), (373, 326)]]
    for i, feat in enumerate(feats):
        stride = 8 << i
        feat_h, feat_w, _ = feat.shape
        anchor = anchors[i]
        feat = sigmoid(feat)
        # Split the channel axis into (anchor, 4 box + 1 conf + classes).
        feat = feat.reshape((feat_h, feat_w, len(anchor), -1))
        box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1)

        hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres)

        num_proposal = hIdx.size
        if not num_proposal:
            continue

        score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx,
                                                              aIdx]
        boxes = box_feat[hIdx, wIdx, aIdx]
        labels = score_feat.argmax(-1)
        scores = score_feat.max(-1)

        indices = np.where(scores > conf_thres)[0]
        if len(indices) == 0:
            continue

        for idx in indices:
            a_w, a_h = anchor[aIdx[idx]]
            x, y, w, h = boxes[idx]
            # Centre is an offset from the grid cell; size scales the anchor.
            x = (x * 2.0 - 0.5 + wIdx[idx]) * stride
            y = (y * 2.0 - 0.5 + hIdx[idx]) * stride
            w = (w * 2.0)**2 * a_w
            h = (h * 2.0)**2 * a_h

            # Convert centre/size to top-left/size.
            x0 = x - w / 2
            y0 = y - h / 2

            self.scores_pro.append(float(scores[idx]))
            self.boxes_pro.append(
                np.array([x0, y0, w, h], dtype=np.float32))
            self.labels_pro.append(int(labels[idx]))
def __ppyoloe_decode(self,
                     feats: List[ndarray],
                     conf_thres: float,
                     num_labels: int = 80,
                     **kwargs):
    """Decode PPYOLOE-style feature maps into proposals.

    The first ``num_labels`` channels are class logits; the remaining
    channels are reshaped to ``(4, reg_max)`` per cell, softmax-normalised
    and reduced to four distances by a dot product with ``0..reg_max-1``.
    Kept proposals are appended to ``self.boxes_pro`` (float32
    ``[x0, y0, w, h]``), ``self.scores_pro`` and ``self.labels_pro``.

    Args:
        feats: One channel-last feature map per level; level ``i`` is
            decoded with stride ``8 << i``.
        conf_thres: Minimum best-class score for a cell to be kept.
        num_labels: Number of class channels.
        **kwargs: ``reg_max`` - number of distribution bins (default 17).
    """
    reg_max: int = kwargs.get('reg_max', 17)
    bins = np.arange(0, reg_max, dtype=np.float32)
    for level, feat in enumerate(feats):
        stride = 8 << level
        cls_map, reg_map = np.split(feat, [
            num_labels,
        ], -1)
        cls_map = sigmoid(cls_map)
        best_label = cls_map.argmax(-1)
        best_score = cls_map.max(-1)
        rows, cols = np.where(best_score > conf_thres)
        count = rows.size
        if not count:
            continue

        kept_scores = best_score[rows, cols]
        dists = reg_map[rows, cols].reshape(count, 4, reg_max)
        # Expected value of each side's distribution, in stride units.
        dists = softmax(dists, -1) @ bins
        kept_labels = best_label[rows, cols]

        for row, col, dist, score, label in zip(rows, cols, dists,
                                                kept_scores, kept_labels):
            left, top, right, bottom = dist

            # Distances are measured from the cell centre.
            x0 = (col + 0.5 - left) * stride
            y0 = (row + 0.5 - top) * stride
            x1 = (col + 0.5 + right) * stride
            y1 = (row + 0.5 + bottom) * stride

            self.scores_pro.append(float(score))
            self.boxes_pro.append(
                np.array([x0, y0, x1 - x0, y1 - y0], dtype=np.float32))
            self.labels_pro.append(int(label))
def __yolov7_decode(self,
                    feats: List[ndarray],
                    conf_thres: float,
                    num_labels: int = 80,
                    **kwargs):
    """Decode YOLOv7 feature maps by delegating to the YOLOv5 decoder.

    Args:
        feats: One channel-last feature map per level.
        conf_thres: Score threshold forwarded to the YOLOv5 decoder.
        num_labels: Forwarded unchanged to the YOLOv5 decoder.
        **kwargs: ``anchors`` - per-level lists of ``(w, h)`` anchors.
            Missing or ``None`` selects the built-in YOLOv7 anchors.
    """
    anchors: Union[List, Tuple, None] = kwargs.get('anchors')
    if anchors is None:
        # ``kwargs.get(key, default)`` would hand an explicit ``None``
        # through and crash later, so apply the default here.
        anchors = [[(12, 16), (19, 36), (40, 28)],
                   [(36, 75), (76, 55), (72, 146)],
                   [(142, 110), (192, 243), (459, 401)]]
    self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors)
class Preprocess:
    """Resize and normalise an image into a ``(1, 3, H, W)`` float32 tensor.

    The normalisation constants and the expected channel order are chosen
    from the model type at construction time.
    """

    def __init__(self, model_type: ModelType):
        """Select mean, std and channel order for *model_type*.

        Raises:
            NotImplementedError: If *model_type* has no preset.
        """
        if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7,
                          ModelType.YOLOV8):
            mean = np.array([0, 0, 0], dtype=np.float32)
            std = np.array([255, 255, 255], dtype=np.float32)
            is_rgb = True
        elif model_type == ModelType.YOLOX:
            mean = np.array([0, 0, 0], dtype=np.float32)
            std = np.array([1, 1, 1], dtype=np.float32)
            is_rgb = False
        elif model_type == ModelType.PPYOLOE:
            mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
            std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
            is_rgb = True

        elif model_type == ModelType.PPYOLOEP:
            mean = np.array([0, 0, 0], dtype=np.float32)
            std = np.array([255, 255, 255], dtype=np.float32)
            is_rgb = True
        elif model_type == ModelType.RTMDET:
            mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
            # NOTE(review): 58.3955 looks like a typo of 58.395 -- confirm
            # against the training config before changing it.
            std = np.array([57.375, 57.12, 58.3955], dtype=np.float32)
            is_rgb = False
        else:
            raise NotImplementedError

        # Shaped (3, 1, 1) to broadcast over a channel-first image.
        self.mean = mean.reshape((3, 1, 1))
        self.std = std.reshape((3, 1, 1))
        self.is_rgb = is_rgb

    def __call__(self,
                 image: ndarray,
                 new_size: Union[List[int], Tuple[int]] = (640, 640),
                 **kwargs) -> Tuple[ndarray, Tuple[float, float]]:
        """Preprocess one image.

        Args:
            image: ``(H, W, 3)`` image; assumed to be in BGR channel order
                as returned by ``cv2.imread``.
            new_size: Target ``(height, width)``.

        Returns:
            The normalised ``(1, 3, height, width)`` float32 array and the
            ``(ratio_w, ratio_h)`` scale factors that were applied.
        """
        # new_size: (height, width)
        height, width = image.shape[:2]
        ratio_h, ratio_w = new_size[0] / height, new_size[1] / width
        image = cv2.resize(
            image, (0, 0),
            fx=ratio_w,
            fy=ratio_h,
            interpolation=cv2.INTER_LINEAR)
        if self.is_rgb:
            # The preset records that the model expects RGB (its mean/std
            # are listed in RGB order), so swap the BGR channels. Without
            # this the flag was stored but never applied.
            image = image[..., ::-1]
        image = np.ascontiguousarray(image.transpose(2, 0, 1))
        image = image.astype(np.float32)
        image -= self.mean
        image /= self.std
        return image[np.newaxis], (ratio_w, ratio_h)
class TRTWrapper(torch.nn.Module):
    """Run a serialized TensorRT engine behind a ``torch.nn.Module`` API.

    The engine file is deserialized once at construction. ``forward``
    binds the data pointers of the input tensors and of the output buffers
    and executes the engine on a dedicated CUDA stream.
    """
    # TensorRT dtype -> torch dtype. Filled at construction because the
    # ``tensorrt`` import is optional at module level.
    dtype_mapping = {}

    def __init__(self, weight: Union[str, Path],
                 device: Optional[torch.device]):
        """Load the engine stored at *weight* for use on *device*.

        Args:
            weight: Path to a ``.engine`` or ``.plan`` file.
            device: Torch device, device string, or CUDA device index.

        Raises:
            RuntimeError: If the ``tensorrt`` package could not be imported.
        """
        super().__init__()
        if trt is None:
            # The module-level import swallows the failure; report it here
            # instead of failing later with an AttributeError on None.
            raise RuntimeError(
                'TRTWrapper requires the `tensorrt` package, which could '
                'not be imported')
        weight = Path(weight) if isinstance(weight, str) else weight
        assert weight.exists() and weight.suffix in ('.engine', '.plan')
        if isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device(f'cuda:{device}')
        self.weight = weight
        self.device = device
        self.stream = torch.cuda.Stream(device=device)
        self.__update_mapping()
        self.__init_engine()
        self.__init_bindings()

    def __update_mapping(self):
        """Register the TensorRT-to-torch dtype conversions."""
        self.dtype_mapping.update({
            trt.bool: torch.bool,
            trt.int8: torch.int8,
            trt.int32: torch.int32,
            trt.float16: torch.float16,
            trt.float32: torch.float32
        })

    def __init_engine(self):
        """Deserialize the engine and record its binding layout."""
        logger = trt.Logger(trt.Logger.ERROR)
        self.log = partial(logger.log, trt.Logger.ERROR)
        trt.init_libnvinfer_plugins(logger, namespace='')
        self.logger = logger
        with trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(self.weight.read_bytes())

        context = model.create_execution_context()

        names = [model.get_binding_name(i) for i in range(model.num_bindings)]

        num_inputs, num_outputs = 0, 0

        for i in range(model.num_bindings):
            if model.binding_is_input(i):
                num_inputs += 1
            else:
                num_outputs += 1

        # The engine is treated as dynamic when the first binding has an
        # unresolved (-1) dimension.
        self.is_dynamic = -1 in model.get_binding_shape(0)

        self.model = model
        self.context = context
        # Assumes all input bindings precede the output bindings.
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.num_bindings = num_inputs + num_outputs
        self.bindings: List[int] = [0] * self.num_bindings

    def __init_bindings(self):
        """Collect name/dtype/shape per binding; preallocate static outputs."""
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
        inputs_info = []
        outputs_info = []

        for i, name in enumerate(self.input_names):
            assert self.model.get_binding_name(i) == name
            dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
            shape = tuple(self.model.get_binding_shape(i))
            inputs_info.append(Binding(name, dtype, shape))

        for i, name in enumerate(self.output_names):
            i += self.num_inputs
            assert self.model.get_binding_name(i) == name
            dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
            shape = tuple(self.model.get_binding_shape(i))
            outputs_info.append(Binding(name, dtype, shape))
        self.inputs_info = inputs_info
        self.outputs_info = outputs_info
        if not self.is_dynamic:
            # Static shapes: the same output buffers are reused by every
            # forward call.
            self.output_tensor = [
                torch.empty(o.shape, dtype=o.dtype, device=self.device)
                for o in outputs_info
            ]

    def forward(self, *inputs):
        """Execute the engine on *inputs* and return a tuple of outputs.

        For a static engine the returned tensors are the preallocated
        buffers, so a later call overwrites them.
        """

        assert len(inputs) == self.num_inputs

        contiguous_inputs: List[torch.Tensor] = [
            i.contiguous() for i in inputs
        ]

        for i in range(self.num_inputs):
            self.bindings[i] = contiguous_inputs[i].data_ptr()
            if self.is_dynamic:
                self.context.set_binding_shape(
                    i, tuple(contiguous_inputs[i].shape))

        # create output tensors
        outputs: List[torch.Tensor] = []

        for i in range(self.num_outputs):
            j = i + self.num_inputs
            if self.is_dynamic:
                shape = tuple(self.context.get_binding_shape(j))
                # The dtype comes from the binding info gathered at init;
                # the previously referenced ``self.output_dtypes`` was
                # never defined.
                output = torch.empty(
                    size=shape,
                    dtype=self.outputs_info[i].dtype,
                    device=self.device)

            else:
                output = self.output_tensor[i]
            outputs.append(output)
            self.bindings[j] = output.data_ptr()

        self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)
        self.stream.synchronize()

        return tuple(outputs)
'dtype', 'shape')) + inputs_info = [] + outputs_info = [] + self.is_dynamic = False + for i, tensor in enumerate(self.session.get_inputs()): + if any(not isinstance(i, int) for i in tensor.shape): + self.is_dynamic = True + inputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + + for i, tensor in enumerate(self.session.get_outputs()): + outputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + self.num_inputs = len(inputs_info) + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[np.ndarray] = [ + i.contiguous().cpu().numpy() for i in inputs + ] + + if not self.is_dynamic: + # make sure input shape is right for static input shape + for i in range(self.num_inputs): + assert contiguous_inputs[i].shape == self.inputs_info[i].shape + + outputs = self.session.run([o.name for o in self.outputs_info], { + j.name: contiguous_inputs[i] + for i, j in enumerate(self.inputs_info) + }) + + return tuple(torch.from_numpy(o).to(self.device) for o in outputs) diff --git a/models/YOLO-World/deploy/easydeploy/model/model.py b/models/YOLO-World/deploy/easydeploy/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..21cf50f7df059ebc7d1974754d290883c06f6a0e --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/model/model.py @@ -0,0 +1,217 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from copy import deepcopy +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.models.layers import ChannelAttention +from mmengine.config import ConfigDict +from torch import Tensor + +from mmyolo.models import RepVGGBlock +from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head, + YOLOv7Head, YOLOv8Head, YOLOXHead) +from mmyolo.models.layers import ImplicitA, ImplicitM +from ..backbone import DeployFocus, GConvFocus, NcnnFocus +from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) +from ..nms import batched_nms, efficient_nms, onnx_nms +from .backend import MMYOLOBackend + + +class DeployModel(nn.Module): + transpose = False + + def __init__(self, + baseModel: nn.Module, + backend: MMYOLOBackend, + postprocess_cfg: Optional[ConfigDict] = None, + with_nms=True, + without_bbox_decoder=False): + super().__init__() + self.baseModel = baseModel + self.baseHead = baseModel.bbox_head + self.backend = backend + self.with_nms = with_nms + self.without_bbox_decoder = without_bbox_decoder + if postprocess_cfg is None: + self.with_postprocess = False + else: + self.with_postprocess = True + self.__init_sub_attributes() + self.detector_type = type(self.baseHead) + self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000) + self.keep_top_k = postprocess_cfg.get('keep_top_k', 100) + self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65) + self.score_threshold = postprocess_cfg.get('score_threshold', 0.25) + self.__switch_deploy() + + def __init_sub_attributes(self): + self.bbox_decoder = self.baseHead.bbox_coder.decode + self.prior_generate = self.baseHead.prior_generator.grid_priors + self.num_base_priors = self.baseHead.num_base_priors + self.featmap_strides = self.baseHead.featmap_strides + self.num_classes = self.baseHead.num_classes + + def __switch_deploy(self): + headType = 
type(self.baseHead) + if not self.with_postprocess: + if headType in (YOLOv5Head, YOLOv7Head): + self.baseHead.head_module.forward_single = self.forward_single + elif headType in (PPYOLOEHead, YOLOv8Head): + self.baseHead.head_module.reg_max = 0 + + if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + self.transpose = True + for layer in self.baseModel.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + elif isinstance(layer, ChannelAttention): + layer.global_avgpool.forward = self.forward_gvp + elif isinstance(layer, Focus): + # onnxruntime openvino tensorrt8 tensorrt7 + if self.backend in (MMYOLOBackend.ONNXRUNTIME, + MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + self.baseModel.backbone.stem = DeployFocus(layer) + # ncnn + elif self.backend == MMYOLOBackend.NCNN: + self.baseModel.backbone.stem = NcnnFocus(layer) + # switch focus to group conv + else: + self.baseModel.backbone.stem = GConvFocus(layer) + + def pred_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + coeff_preds: Optional[List[Tensor]] = None, + proto_preds: Optional[List[Tensor]] = None, + **kwargs): + assert len(cls_scores) == len(bbox_preds) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + + nms_func = self.select_nms() + if self.detector_type in (YOLOv5Head, YOLOv7Head): + bbox_decoder = yolov5_bbox_decoder + elif self.detector_type is RTMDetHead: + bbox_decoder = rtmdet_bbox_decoder + elif self.detector_type is YOLOXHead: + bbox_decoder = yolox_bbox_decoder + else: + bbox_decoder = self.bbox_decoder + print(bbox_decoder) + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generate(featmap_sizes, + dtype=dtype, + device=device) + + flatten_priors = torch.cat(mlvl_priors) + mlvl_strides = [ + flatten_priors.new_full( + 
(featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + text_len = cls_scores[0].shape[1] + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, text_len) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + bboxes = flatten_bbox_preds + if self.without_bbox_decoder: + return scores, bboxes + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + if self.with_nms: + return nms_func(bboxes, scores, self.keep_top_k, + self.iou_threshold, self.score_threshold, + self.pre_top_k, self.keep_top_k) + else: + return scores, bboxes + + def select_nms(self): + if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO): + nms_func = onnx_nms + elif self.backend == MMYOLOBackend.TENSORRT8: + nms_func = efficient_nms + elif self.backend == MMYOLOBackend.TENSORRT7: + nms_func = batched_nms + else: + raise NotImplementedError + if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead): + nms_func = partial(nms_func, box_coding=1) + + return nms_func + + def forward(self, inputs: Tensor): + neck_outputs = self.baseModel(inputs) + if self.with_postprocess: + return self.pred_by_feat(*neck_outputs) + else: + outputs = [] + if self.transpose: + for feats in zip(*neck_outputs): + if self.backend in (MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + 
outputs.append( + torch.cat( + [feat.permute(0, 2, 3, 1) for feat in feats], + -1)) + else: + outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1)) + else: + for feats in zip(*neck_outputs): + outputs.append(torch.cat(feats, 1)) + return tuple(outputs) + + @staticmethod + def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]: + if isinstance(convs, nn.Sequential) and any( + type(m) in (ImplicitA, ImplicitM) for m in convs): + a, c, m = convs + aw = a.implicit.clone() + mw = m.implicit.clone() + c = deepcopy(c) + nw, cw, _, _ = c.weight.shape + na, ca, _, _ = aw.shape + nm, cm, _, _ = mw.shape + c.bias = nn.Parameter(c.bias + ( + c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1)) + c.bias = nn.Parameter(c.bias * mw.reshape(cm)) + c.weight = nn.Parameter(c.weight * mw.transpose(0, 1)) + convs = c + feat = convs(x) + return (feat, ) + + @staticmethod + def forward_gvp(x: Tensor) -> Tensor: + return torch.mean(x, [2, 3], keepdim=True) diff --git a/models/YOLO-World/deploy/easydeploy/nms/__init__.py b/models/YOLO-World/deploy/easydeploy/nms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59c5cdbd2b3b195125a14f473b825f616755fd6e --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/nms/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ort_nms import onnx_nms +from .trt_nms import batched_nms, efficient_nms + +__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] diff --git a/models/YOLO-World/deploy/easydeploy/nms/ort_nms.py b/models/YOLO-World/deploy/easydeploy/nms/ort_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..597f3fb6f33c5bf182aa9c5ba4740e53168b005a --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/nms/ort_nms.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from torch import Tensor +from torchvision.ops import batched_nms + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +def sort_nms_index(nms_index, scores, batch_size, keep_top_k=-1): + """ + first sort the nms_index by batch, and then sort by score in every image result, final apply keep_top_k strategy. In the process, we can also get the number of detections for each image: num_dets + """ + # first sort by batch index to make sure that the same batch index is together + device = nms_index.device + nms_index_indices = torch.argsort(nms_index[:, 0], dim=0).to(device) + nms_index = nms_index[nms_index_indices] + + scores = scores[nms_index[:, 0], nms_index[:, 1], nms_index[:, 2]] + batch_inds = nms_index[:, 0] + + # Get the number of detections for each image + num_dets = torch.bincount(batch_inds,minlength=batch_size).to(device) + # Calculate the sum from front to back + cumulative_sum = torch.cumsum(num_dets, dim=0).to(device) + # add initial value 0 + cumulative_sum = torch.cat((torch.tensor([0]).to(device), cumulative_sum)) + for i in range(len(num_dets)): + start = cumulative_sum[i] + end = cumulative_sum[i + 1] + # sort by score in every batch + block_idx = torch.argsort(scores[start:end], descending=True).to(device) + nms_index[start:end] = nms_index[start:end][block_idx] + if keep_top_k > 0 and end - start > keep_top_k: + # delete lines from start+keep_top_k to end to keep only top k + nms_index = torch.cat( + (nms_index[: start + keep_top_k], nms_index[end:]), dim=0 + ) + num_dets[i] -= end - start - keep_top_k + cumulative_sum -= end - start - keep_top_k + return nms_index, num_dets + + +def select_nms_index( + scores: Tensor, + boxes: Tensor, + nms_index: Tensor, + batch_size: int, + keep_top_k: int = -1, +): + if nms_index.numel() == 0: + return torch.empty(0), torch.empty(0, 4), torch.empty(0), torch.empty(0) + nms_index, num_dets = 
sort_nms_index(nms_index, scores, batch_size, keep_top_k) + batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1] + box_inds = nms_index[:, 2] + + # according to the nms_index to get the scores,boxes and labels + batched_scores = scores[batch_inds, cls_inds, box_inds] + batched_dets = boxes[batch_inds, box_inds, ...] + batched_labels = cls_inds + + return num_dets, batched_dets, batched_scores, batched_labels + + +def construct_indice(batch_idx, select_bbox_idxs, class_idxs, original_idxs): + num_bbox = len(select_bbox_idxs) + class_idxs = class_idxs[select_bbox_idxs] + indice = torch.zeros((num_bbox, 3), dtype=torch.int32).to(select_bbox_idxs.device) + # batch_idx + indice[:, 0] = batch_idx + # class_idxs + indice[:, 1] = class_idxs + # select_bbox_idxs + indice[:, 2] = original_idxs[select_bbox_idxs] + return indice + + +def filter_max_boxes_per_class( + select_bbox_idxs, class_idxs, max_output_boxes_per_class +): + class_counts = {} # used to track the count of each class + + filtered_select_bbox_idxs = [] + filtered_max_class_idxs = [] + + for bbox_idx, class_idx in zip(select_bbox_idxs, class_idxs): + class_count = class_counts.get( + class_idx.item(), 0 + ) # Get the count of the current class, or return 0 if it does not exist + if class_count < max_output_boxes_per_class: + filtered_select_bbox_idxs.append(bbox_idx) + filtered_max_class_idxs.append(class_idx) + class_counts[class_idx.item()] = class_count + 1 + return torch.tensor(filtered_select_bbox_idxs), torch.tensor( + filtered_max_class_idxs + ) + + +class ONNXNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]) + ) -> Tensor: + """ + Non-Maximum Suppression (NMS) implementation. + + Args: + boxes (Tensor): Bounding boxes of shape (batch_size, num_boxes, 4). 
+ scores (Tensor): Confidence scores of shape (batch_size, num_classes, num_boxes). + max_output_boxes_per_class (Tensor): Maximum number of output boxes per class. + iou_threshold (Tensor): IoU threshold for NMS. + score_threshold (Tensor): Confidence score threshold. + + Returns: + Tensor: Selected indices of shape (num_det, 3).first value is batch index, second value is class index, third value is box index + """ + device = boxes.device + batch_size, num_classes, num_boxes = scores.shape + selected_indices = [] + for batch_idx in range(batch_size): + boxes_per_image = boxes[batch_idx] + scores_per_image = scores[batch_idx] + + # If no boxes in this image, continue to the next image + if boxes_per_image.numel() == 0: + continue + + # for one box, only exist one class,so use torch.max to get the max score and class index + scores_per_image, class_idxs = torch.max(scores_per_image, dim=0) + # Apply score threshold before batched_nms bacause nms operation is time expensive + keep_idxs = scores_per_image > score_threshold + if not torch.any(keep_idxs): + # If no boxes left after applying score threshold, continue to the next image + continue + + boxes_per_image = boxes_per_image[keep_idxs] + scores_per_image = scores_per_image[keep_idxs] + class_idxs = class_idxs[keep_idxs] + + # The purpose of original_idxs is we want to return the indexs to the original input data instead of the filtered. 
+ original_idxs = torch.arange(num_boxes, device=device)[keep_idxs] + # reference: https://pytorch.org/vision/main/generated/torchvision.ops.batched_nms.html + select_bbox_idxs = batched_nms( + boxes_per_image, scores_per_image, class_idxs, iou_threshold + ) + if ( + select_bbox_idxs.shape[0] > max_output_boxes_per_class + ): # If the boxes detected by all classes together are less than max_output_boxes_per_class, then there is no need to filter + select_bbox_idxs, _ = filter_max_boxes_per_class( + select_bbox_idxs, + class_idxs[select_bbox_idxs], + max_output_boxes_per_class, + ) + selected_indice = construct_indice( + batch_idx, select_bbox_idxs, class_idxs, original_idxs + ) + selected_indices.append(selected_indice) + if len(selected_indices) == 0: + return torch.tensor([], device=device) + selected_indices = torch.cat(selected_indices, dim=0) + return selected_indices + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]), + ): + return g.op( + 'NonMaxSuppression', + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + outputs=1) + + +def onnx_nms( + boxes: torch.Tensor, + scores: torch.Tensor, + max_output_boxes_per_class: int = 100, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class]) + iou_threshold = torch.tensor([iou_threshold]).to(boxes.device) + score_threshold = torch.tensor([score_threshold]).to(boxes.device) + + batch_size, _, _ = scores.shape + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + scores = scores.transpose(1, 2).contiguous() + selected_indices = ONNXNMSop.apply(boxes, scores, + max_output_boxes_per_class, + iou_threshold, score_threshold) + + num_dets, 
batched_dets, batched_scores, batched_labels = select_nms_index( + scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k) + + return num_dets, batched_dets, batched_scores, batched_labels.to( + torch.int32) diff --git a/models/YOLO-World/deploy/easydeploy/nms/trt_nms.py b/models/YOLO-World/deploy/easydeploy/nms/trt_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..e0db1e2164d4366ff9ce4f74d39ded917c39ba79 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/nms/trt_nms.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +class TRTEfficientNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25, + ): + batch_size, _, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25): + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + 
plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +class TRTbatchedNMSop(torch.autograd.Function): + """TensorRT NMS operation.""" + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + batch_size, _, numClasses = scores.shape + num_det = torch.randint( + 0, keepTopK, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, keepTopK, 4) + det_scores = torch.randn(batch_size, keepTopK) + det_classes = torch.randint(0, numClasses, + (batch_size, keepTopK)).float() + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + out = g.op( + 'TRT::BatchedNMSDynamic_TRT', + boxes, + scores, + shareLocation_i=shareLocation, + plugin_version_s=plugin_version, + backgroundLabelId_i=backgroundLabelId, + numClasses_i=numClasses, + topK_i=topK, + keepTopK_i=keepTopK, + scoreThreshold_f=scoreThreshold, + iouThreshold_f=iouThreshold, + isNormalized_i=isNormalized, + clipBoxes_i=clipBoxes, + scoreBits_i=scoreBits, + caffeSemantics_i=caffeSemantics, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +def 
_efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + return num_det, det_boxes, det_scores, det_classes + + +def _batched_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. 
+ iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, _, numClasses = scores.shape + + num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply( + boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096), + keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1) + + det_classes = det_classes.int() + return num_det, det_boxes, det_scores, det_classes + + +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +def batched_nms(*args, **kwargs): + """Wrapper function for `_batched_nms`.""" + return _batched_nms(*args, **kwargs) diff --git a/models/YOLO-World/deploy/easydeploy/onnx_demo.py b/models/YOLO-World/deploy/easydeploy/onnx_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/YOLO-World/deploy/easydeploy/tools/build_engine.py b/models/YOLO-World/deploy/easydeploy/tools/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..b400c9db826878a7bb0fb13f4b1dea9b793583e7 --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/tools/build_engine.py @@ -0,0 +1,136 @@ +import argparse +from pathlib import Path +from typing import List, 
Optional, Tuple, Union + +try: + import tensorrt as trt +except Exception: + trt = None +import warnings + +import numpy as np +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class EngineBuilder: + + def __init__( + self, + checkpoint: Union[str, Path], + opt_shape: Union[Tuple, List] = (1, 3, 640, 640), + device: Optional[Union[str, int, torch.device]] = None) -> None: + checkpoint = Path(checkpoint) if isinstance(checkpoint, + str) else checkpoint + assert checkpoint.exists() and checkpoint.suffix == '.onnx' + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + + self.checkpoint = checkpoint + self.opt_shape = np.array(opt_shape, dtype=np.float32) + self.device = device + + def __build_engine(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling: bool = True) -> None: + logger = trt.Logger(trt.Logger.WARNING) + trt.init_libnvinfer_plugins(logger, namespace='') + builder = trt.Builder(logger) + config = builder.create_builder_config() + config.max_workspace_size = torch.cuda.get_device_properties( + self.device).total_memory + flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + network = builder.create_network(flag) + parser = trt.OnnxParser(network, logger) + if not parser.parse_from_file(str(self.checkpoint)): + raise RuntimeError( + f'failed to load ONNX file: {str(self.checkpoint)}') + inputs = [network.get_input(i) for i in range(network.num_inputs)] + outputs = [network.get_output(i) for i in range(network.num_outputs)] + profile = None + dshape = -1 in network.get_input(0).shape + if dshape: + profile = builder.create_optimization_profile() + if scale is None: + scale = np.array( + [[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]], + dtype=np.float32) + scale = (self.opt_shape * scale).astype(np.int32) + elif isinstance(scale, List): + scale = np.array(scale, dtype=np.int32) + assert 
scale.shape[0] == 3, 'Input a wrong scale list' + else: + raise NotImplementedError + + for inp in inputs: + logger.log( + trt.Logger.WARNING, + f'input "{inp.name}" with shape{inp.shape} {inp.dtype}') + if dshape: + profile.set_shape(inp.name, *scale) + for out in outputs: + logger.log( + trt.Logger.WARNING, + f'output "{out.name}" with shape{out.shape} {out.dtype}') + if fp16 and builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + self.weight = self.checkpoint.with_suffix('.engine') + if dshape: + config.add_optimization_profile(profile) + if with_profiling: + config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED + with builder.build_engine(network, config) as engine: + self.weight.write_bytes(engine.serialize()) + logger.log( + trt.Logger.WARNING, f'Build tensorrt engine finish.\n' + f'Save in {str(self.weight.absolute())}') + + def build(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling=True): + self.__build_engine(scale, fp16, with_profiling) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--device', type=str, default='cuda:0', help='TensorRT builder device') + parser.add_argument( + '--scales', + type=str, + default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]', + help='Input scales for build dynamic input shape engine') + parser.add_argument( + '--fp16', action='store_true', help='Build model with fp16 mode') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def main(args): + img_size = (1, 3, *args.img_size) + try: + scales = eval(args.scales) + except Exception: + print('Input scales is not a python variable') + print('Set scales default None') + scales = None + builder = EngineBuilder(args.checkpoint, img_size, 
args.device) + builder.build(scales, fp16=args.fp16) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/models/YOLO-World/deploy/easydeploy/tools/export_onnx.py b/models/YOLO-World/deploy/easydeploy/tools/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b937cc8a72b5c09d61580ddb1297213693adaf1c --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/tools/export_onnx.py @@ -0,0 +1,157 @@ +import argparse +import os +import sys +import warnings +from io import BytesIO +from pathlib import Path + +import onnx +import torch +from mmdet.apis import init_detector +from mmengine.config import ConfigDict +from mmengine.logging import print_log +from mmengine.utils.path import mkdir_or_exist + +# Add MMYOLO ROOT to sys.path +sys.path.append(str(Path(__file__).resolve().parents[3])) +from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 + +warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) +warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) +warnings.filterwarnings(action='ignore', category=UserWarning) +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=ResourceWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--model-only', action='store_true', help='Export model only') + parser.add_argument( + '--work-dir', default='./work_dir', help='Path to save export model') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument('--batch-size', type=int, default=1, help='Batch size') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--simplify', + action='store_true', + 
help='Simplify onnx model by onnx-sim') + parser.add_argument( + '--opset', type=int, default=11, help='ONNX opset version') + parser.add_argument( + '--backend', + type=str, + default='onnxruntime', + help='Backend for export onnx') + parser.add_argument( + '--pre-topk', + type=int, + default=1000, + help='Postprocess pre topk bboxes feed into NMS') + parser.add_argument( + '--keep-topk', + type=int, + default=100, + help='Postprocess keep topk bboxes out of NMS') + parser.add_argument( + '--iou-threshold', + type=float, + default=0.65, + help='IoU threshold for NMS') + parser.add_argument( + '--score-threshold', + type=float, + default=0.25, + help='Score threshold for NMS') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def build_model_from_cfg(config_path, checkpoint_path, device): + model = init_detector(config_path, checkpoint_path, device=device) + model.eval() + return model + + +def main(): + args = parse_args() + mkdir_or_exist(args.work_dir) + backend = MMYOLOBackend(args.backend.lower()) + if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + if not args.model_only: + print_log('Export ONNX with bbox decoder and NMS ...') + else: + args.model_only = True + print_log(f'Can not export postprocess for {args.backend.lower()}.\n' + f'Set "args.model_only=True" default.') + if args.model_only: + postprocess_cfg = None + output_names = None + else: + postprocess_cfg = ConfigDict( + pre_top_k=args.pre_topk, + keep_top_k=args.keep_topk, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold) + output_names = ['num_dets', 'boxes', 'scores', 'labels'] + baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device) + + deploy_model = DeployModel( + baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg) + deploy_model.eval() + + fake_input = torch.randn(args.batch_size, 3, + 
*args.img_size).to(args.device) + # dry run + deploy_model(fake_input) + + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) + # export onnx + with BytesIO() as f: + torch.onnx.export( + deploy_model, + fake_input, + f, + input_names=['images'], + output_names=output_names, + opset_version=args.opset) + f.seek(0) + onnx_model = onnx.load(f) + onnx.checker.check_model(onnx_model) + + # Fix tensorrt onnx output shape, just for view + if not args.model_only and backend in (MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + shapes = [ + args.batch_size, 1, args.batch_size, args.keep_topk, 4, + args.batch_size, args.keep_topk, args.batch_size, + args.keep_topk + ] + for i in onnx_model.graph.output: + for j in i.type.tensor_type.shape.dim: + j.dim_param = str(shapes.pop(0)) + if args.simplify: + try: + import onnxsim + onnx_model, check = onnxsim.simplify(onnx_model) + assert check, 'assert check failed' + except Exception as e: + print_log(f'Simplify failure: {e}') + onnx.save(onnx_model, save_onnx_path) + print_log(f'ONNX export success, save into {save_onnx_path}') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/deploy/easydeploy/tools/image-demo.py b/models/YOLO-World/deploy/easydeploy/tools/image-demo.py new file mode 100644 index 0000000000000000000000000000000000000000..12ebaddce60b30021fea6a2f512cb8248db45a8e --- /dev/null +++ b/models/YOLO-World/deploy/easydeploy/tools/image-demo.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from easydeploy.model import ORTWrapper, TRTWrapper # isort:skip +import os +import random +from argparse import ArgumentParser + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import Compose +from mmdet.utils import get_test_pipeline_cfg +from mmengine.config import Config, ConfigDict +from mmengine.utils import ProgressBar, path + +from mmyolo.utils import register_all_modules +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + args = parser.parse_args() + return args + + +def preprocess(config): + data_preprocess = config.get('model', {}).get('data_preprocessor', {}) + mean = data_preprocess.get('mean', [0., 0., 0.]) + std = data_preprocess.get('std', [1., 1., 1.]) + mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1) + std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1) + + class PreProcess(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x[None].float() + x -= mean.to(x.device) + x /= std.to(x.device) + return x + + return PreProcess().eval() + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)] + + # build the model from a config file and a checkpoint file + if args.checkpoint.endswith('.onnx'): + model = ORTWrapper(args.checkpoint, args.device) + elif args.checkpoint.endswith('.engine') or 
args.checkpoint.endswith( + '.plan'): + model = TRTWrapper(args.checkpoint, args.device) + else: + raise NotImplementedError + + model.to(args.device) + + cfg = Config.fromfile(args.config) + class_names = cfg.get('class_name') + + test_pipeline = get_test_pipeline_cfg(cfg) + test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'}) + test_pipeline = Compose(test_pipeline) + + pre_pipeline = preprocess(cfg) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for i, file in enumerate(files): + bgr = mmcv.imread(file) + rgb = mmcv.imconvert(bgr, 'bgr', 'rgb') + data, samples = test_pipeline(dict(img=rgb, img_id=i)).values() + pad_param = samples.get('pad_param', + np.array([0, 0, 0, 0], dtype=np.float32)) + h, w = samples.get('ori_shape', rgb.shape[:2]) + pad_param = torch.asarray( + [pad_param[2], pad_param[0], pad_param[2], pad_param[0]], + device=args.device) + scale_factor = samples.get('scale_factor', [1., 1]) + scale_factor = torch.asarray(scale_factor * 2, device=args.device) + data = pre_pipeline(data).to(args.device) + + result = model(data) + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # Get candidate predict info by num_dets + num_dets, bboxes, scores, labels = result + scores = scores[0, :num_dets] + bboxes = bboxes[0, :num_dets] + labels = labels[0, :num_dets] + bboxes -= pad_param + bboxes /= scale_factor + + bboxes[:, 0::2].clamp_(0, w) + bboxes[:, 1::2].clamp_(0, h) + bboxes = bboxes.round().int() + + for (bbox, score, label) in zip(bboxes, scores, labels): + bbox = bbox.tolist() + color = colors[label] + + if class_names is not None: + label_name = class_names[label] + name = f'cls:{label_name}_score:{score:0.4f}' + else: + name = 
f'cls:{label}_score:{score:0.4f}' + + cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2) + cv2.putText( + bgr, + name, (bbox[0], bbox[1] - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 2.0, [225, 255, 255], + thickness=3) + + if args.show: + mmcv.imshow(bgr, 'result', 0) + else: + mmcv.imwrite(bgr, out_file) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/deploy/export_onnx.py b/models/YOLO-World/deploy/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..4041b12112ae96d5410177c51f08fcd28ad3bb48 --- /dev/null +++ b/models/YOLO-World/deploy/export_onnx.py @@ -0,0 +1,182 @@ +# # Copyright (c) OpenMMLab. All rights reserved. +import os +import json +import warnings +import argparse +from io import BytesIO + +import onnx +import torch +from mmdet.apis import init_detector +from mmengine.config import ConfigDict +from mmengine.logging import print_log +from mmengine.utils.path import mkdir_or_exist + +from easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 + +warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) +warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) +warnings.filterwarnings(action='ignore', category=UserWarning) +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=ResourceWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('--custom-text', + type=str, + help='custom text inputs (text json) for YOLO-World.') + parser.add_argument('--add-padding', + action="store_true", + help="add an empty padding to texts.") + parser.add_argument('--model-only', + action='store_true', + help='Export model only') + parser.add_argument('--without-nms', + action='store_true', + help='Export model without NMS') + 
parser.add_argument('--without-bbox-decoder', + action='store_true', + help='Export model without Bbox Decoder (for INT8 Quantization)') + parser.add_argument('--work-dir', + default='./work_dirs', + help='Path to save export model') + parser.add_argument('--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument('--batch-size', type=int, default=1, help='Batch size') + parser.add_argument('--device', + default='cuda:0', + help='Device used for inference') + parser.add_argument('--simplify', + action='store_true', + help='Simplify onnx model by onnx-sim') + parser.add_argument('--opset', + type=int, + default=11, + help='ONNX opset version') + parser.add_argument('--backend', + type=str, + default='onnxruntime', + help='Backend for export onnx') + parser.add_argument('--pre-topk', + type=int, + default=1000, + help='Postprocess pre topk bboxes feed into NMS') + parser.add_argument('--keep-topk', + type=int, + default=100, + help='Postprocess keep topk bboxes out of NMS') + parser.add_argument('--iou-threshold', + type=float, + default=0.65, + help='IoU threshold for NMS') + parser.add_argument('--score-threshold', + type=float, + default=0.25, + help='Score threshold for NMS') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def build_model_from_cfg(config_path, checkpoint_path, device): + model = init_detector(config_path, checkpoint_path, device=device) + model.eval() + return model + + +def main(): + args = parse_args() + mkdir_or_exist(args.work_dir) + backend = MMYOLOBackend(args.backend.lower()) + if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + if not args.model_only: + print_log('Export ONNX with bbox decoder and NMS ...') + else: + args.model_only = True + print_log(f'Can not export postprocess for {args.backend.lower()}.\n' + f'Set "args.model_only=True" default.') + 
if args.model_only: + postprocess_cfg = None + output_names = None + else: + postprocess_cfg = ConfigDict(pre_top_k=args.pre_topk, + keep_top_k=args.keep_topk, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold) + + output_names = ['num_dets', 'boxes', 'scores', 'labels'] + if args.without_bbox_decoder or args.without_nms: + output_names = ['scores', 'boxes'] + + if args.custom_text is not None and len(args.custom_text) > 0: + with open(args.custom_text) as f: + texts = json.load(f) + texts = [x[0] for x in texts] + else: + from mmdet.datasets import CocoDataset + texts = CocoDataset.METAINFO['classes'] + if args.add_padding: + texts = texts + [' '] + + baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device) + if hasattr(baseModel, 'reparameterize'): + # reparameterize text into YOLO-World + baseModel.reparameterize([texts]) + deploy_model = DeployModel(baseModel=baseModel, + backend=backend, + postprocess_cfg=postprocess_cfg, + with_nms=not args.without_nms, + without_bbox_decoder=args.without_bbox_decoder) + deploy_model.eval() + + fake_input = torch.randn(args.batch_size, 3, + *args.img_size).to(args.device) + # dry run + deploy_model(fake_input) + + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) + # export onnx + with BytesIO() as f: + torch.onnx.export(deploy_model, + fake_input, + f, + input_names=['images'], + output_names=output_names, + opset_version=args.opset) + f.seek(0) + onnx_model = onnx.load(f) + onnx.checker.check_model(onnx_model) + + # Fix tensorrt onnx output shape, just for view + if not args.model_only and not args.without_nms and backend in ( + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + shapes = [ + args.batch_size, 1, args.batch_size, args.keep_topk, 4, + args.batch_size, args.keep_topk, args.batch_size, + args.keep_topk + ] + for i in onnx_model.graph.output: + for j in i.type.tensor_type.shape.dim: + j.dim_param = 
str(shapes.pop(0)) + if args.simplify: + try: + import onnxsim + onnx_model, check = onnxsim.simplify(onnx_model) + assert check, 'assert check failed' + except Exception as e: + print_log(f'Simplify failure: {e}') + onnx.save(onnx_model, save_onnx_path) + print_log(f'ONNX export success, save into {save_onnx_path}') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/deploy/onnx_demo.py b/models/YOLO-World/deploy/onnx_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..35f2713ecd695a18c47837d3036022983f75a254 --- /dev/null +++ b/models/YOLO-World/deploy/onnx_demo.py @@ -0,0 +1,235 @@ +import os +import json +import argparse +import os.path as osp + +import cv2 +import numpy as np +import supervision as sv +import onnxruntime as ort +from mmengine.utils import ProgressBar + +try: + import torch + from torchvision.ops import nms +except Exception as e: + print(e) + +BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1) +MASK_ANNOTATOR = sv.MaskAnnotator() + + +class LabelAnnotator(sv.LabelAnnotator): + + @staticmethod + def resolve_text_background_xyxy( + center_coordinates, + text_wh, + position, + ): + center_x, center_y = center_coordinates + text_w, text_h = text_wh + return center_x, center_y, center_x + text_w, center_y + text_h + + +LABEL_ANNOTATOR = LabelAnnotator(text_padding=4, + text_scale=0.5, + text_thickness=1) + + +def parse_args(): + parser = argparse.ArgumentParser('YOLO-World ONNX Demo') + parser.add_argument('onnx', help='onnx file') + parser.add_argument('image', help='image path, include image file or dir.') + parser.add_argument( + 'text', + help= + 'detecting texts (str or json), should be consistent with the ONNX model' + ) + parser.add_argument('--output-dir', + default='./output', + help='directory to save output files') + parser.add_argument('--device', + default='cuda:0', + help='device used for inference') + parser.add_argument( + '--onnx-nms', + action='store_false', + help='whether 
ONNX model contains NMS and postprocessing') + args = parser.parse_args() + return args + + +def preprocess(image, size=(640, 640)): + h, w = image.shape[:2] + max_size = max(h, w) + scale_factor = size[0] / max_size + pad_h = (max_size - h) // 2 + pad_w = (max_size - w) // 2 + pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype) + pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image + image = cv2.resize(pad_image, size, + interpolation=cv2.INTER_LINEAR).astype('float32') + image /= 255.0 + image = image[None] + return image, scale_factor, (pad_h, pad_w) + + +def visualize(image, bboxes, labels, scores, texts): + detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores) + labels = [ + f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in + zip(detections.class_id, detections.confidence) + ] + + image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections) + image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels) + return image + + +def inference(ort_session, + image_path, + texts, + output_dir, + size=(640, 640), + **kwargs): + # normal export + # with NMS and postprocessing + ori_image = cv2.imread(image_path) + h, w = ori_image.shape[:2] + image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]], + size) + input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2))) + results = ort_session.run(["num_dets", "labels", "scores", "boxes"], + {"images": input_ort}) + num_dets, labels, scores, bboxes = results + num_dets = num_dets[0][0] + labels = labels[0, :num_dets] + scores = scores[0, :num_dets] + bboxes = bboxes[0, :num_dets] + + bboxes -= np.array( + [pad_param[1], pad_param[0], pad_param[1], pad_param[0]]) + bboxes /= scale_factor + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h) + bboxes = bboxes.round().astype('int') + + image_out = visualize(ori_image, bboxes, labels, scores, texts) + cv2.imwrite(osp.join(output_dir, 
osp.basename(image_path)), image_out) + return image_out + + +def inference_with_postprocessing(ort_session, + image_path, + texts, + output_dir, + size=(640, 640), + nms_thr=0.7, + score_thr=0.3, + max_dets=300): + # export with `--without-nms` + ori_image = cv2.imread(image_path) + h, w = ori_image.shape[:2] + image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]], + size) + input_ort = ort.OrtValue.ortvalue_from_numpy(image.transpose((0, 3, 1, 2))) + results = ort_session.run(["scores", "boxes"], {"images": input_ort}) + scores, bboxes = results + # move numpy array to torch + ori_scores = torch.from_numpy(scores[0]).to('cuda:0') + ori_bboxes = torch.from_numpy(bboxes[0]).to('cuda:0') + + scores_list = [] + labels_list = [] + bboxes_list = [] + # class-specific NMS + for cls_id in range(len(texts)): + cls_scores = ori_scores[:, cls_id] + labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id + keep_idxs = nms(ori_bboxes, cls_scores, iou_threshold=nms_thr) + cur_bboxes = ori_bboxes[keep_idxs] + cls_scores = cls_scores[keep_idxs] + labels = labels[keep_idxs] + scores_list.append(cls_scores) + labels_list.append(labels) + bboxes_list.append(cur_bboxes) + + scores = torch.cat(scores_list, dim=0) + labels = torch.cat(labels_list, dim=0) + bboxes = torch.cat(bboxes_list, dim=0) + + keep_idxs = scores > score_thr + scores = scores[keep_idxs] + labels = labels[keep_idxs] + bboxes = bboxes[keep_idxs] + if len(keep_idxs) > max_dets: + _, sorted_idx = torch.sort(scores, descending=True) + keep_idxs = sorted_idx[:max_dets] + bboxes = bboxes[keep_idxs] + scores = scores[keep_idxs] + labels = labels[keep_idxs] + + # Get candidate predict info by num_dets + scores = scores.cpu().numpy() + bboxes = bboxes.cpu().numpy() + labels = labels.cpu().numpy() + + bboxes -= np.array( + [pad_param[1], pad_param[0], pad_param[1], pad_param[0]]) + bboxes /= scale_factor + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w) + bboxes[:, 1::2] = np.clip(bboxes[:, 
1::2], 0, h) + bboxes = bboxes.round().astype('int') + + image_out = visualize(ori_image, bboxes, labels, scores, texts) + cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out) + return image_out + + +def main(): + + args = parse_args() + onnx_file = args.onnx + # init ONNX session + ort_session = ort.InferenceSession( + onnx_file, providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) + print("Init ONNX Runtime session") + output_dir = "onnx_outputs" + if not osp.exists(output_dir): + os.mkdir(output_dir) + + # load images + if not osp.isfile(args.image): + images = [ + osp.join(args.image, img) for img in os.listdir(args.image) + if img.endswith('.png') or img.endswith('.jpg') + ] + else: + images = [args.image] + + if args.text.endswith('.txt'): + with open(args.text) as f: + lines = f.readlines() + texts = [[t.rstrip('\r\n')] for t in lines] + elif args.text.endswith('.json'): + texts = json.load(open(args.text)) + else: + texts = [[t.strip()] for t in args.text.split(',')] + + print("Start to inference.") + progress_bar = ProgressBar(len(images)) + + if args.onnx_nms: + inference_func = inference + else: + inference_func = inference_with_postprocessing + + for img in images: + inference_func(ort_session, img, texts, output_dir=output_dir) + progress_bar.update() + print("Finish inference") + + +if __name__ == "__main__": + main() diff --git a/models/YOLO-World/deploy/tflite_demo.py b/models/YOLO-World/deploy/tflite_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..ae5bf1a7013d07eef032917391bfa20caede8395 --- /dev/null +++ b/models/YOLO-World/deploy/tflite_demo.py @@ -0,0 +1,254 @@ +import os +import json +import argparse +import os.path as osp + +import cv2 +import tqdm +import torch +import numpy as np +import tensorflow as tf +import supervision as sv +from torchvision.ops import nms + +BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator(thickness=1) +MASK_ANNOTATOR = sv.MaskAnnotator() + + +class 
LabelAnnotator(sv.LabelAnnotator): + + @staticmethod + def resolve_text_background_xyxy( + center_coordinates, + text_wh, + position, + ): + center_x, center_y = center_coordinates + text_w, text_h = text_wh + return center_x, center_y, center_x + text_w, center_y + text_h + + +LABEL_ANNOTATOR = LabelAnnotator(text_padding=4, + text_scale=0.5, + text_thickness=1) + + +def parse_args(): + parser = argparse.ArgumentParser('YOLO-World TFLite (INT8) Demo') + parser.add_argument('path', help='TFLite Model `.tflite`') + parser.add_argument('image', help='image path, include image file or dir.') + parser.add_argument( + 'text', + help= + 'detecting texts (str, txt, or json), should be consistent with the ONNX model' + ) + parser.add_argument('--output-dir', + default='./output', + help='directory to save output files') + args = parser.parse_args() + return args + + +def preprocess(image, size=(640, 640)): + h, w = image.shape[:2] + max_size = max(h, w) + scale_factor = size[0] / max_size + pad_h = (max_size - h) // 2 + pad_w = (max_size - w) // 2 + pad_image = np.zeros((max_size, max_size, 3), dtype=image.dtype) + pad_image[pad_h:h + pad_h, pad_w:w + pad_w] = image + image = cv2.resize(pad_image, size, + interpolation=cv2.INTER_LINEAR).astype('float32') + image /= 255.0 + image = image[None] + return image, scale_factor, (pad_h, pad_w) + + +def generate_anchors_per_level(feat_size, stride, offset=0.5): + h, w = feat_size + shift_x = (torch.arange(0, w) + offset) * stride + shift_y = (torch.arange(0, h) + offset) * stride + yy, xx = torch.meshgrid(shift_y, shift_x) + anchors = torch.stack([xx, yy]).reshape(2, -1).transpose(0, 1) + return anchors + + +def generate_anchors(feat_sizes=[(80, 80), (40, 40), (20, 20)], + strides=[8, 16, 32], + offset=0.5): + anchors = [ + generate_anchors_per_level(fs, s, offset) + for fs, s in zip(feat_sizes, strides) + ] + anchors = torch.cat(anchors) + return anchors + + +def simple_bbox_decode(points, pred_bboxes, stride): + + pred_bboxes = 
pred_bboxes * stride[None, :, None] + x1 = points[..., 0] - pred_bboxes[..., 0] + y1 = points[..., 1] - pred_bboxes[..., 1] + x2 = points[..., 0] + pred_bboxes[..., 2] + y2 = points[..., 1] + pred_bboxes[..., 3] + bboxes = torch.stack([x1, y1, x2, y2], -1) + + return bboxes + + +def visualize(image, bboxes, labels, scores, texts): + detections = sv.Detections(xyxy=bboxes, class_id=labels, confidence=scores) + labels = [ + f"{texts[class_id][0]} {confidence:0.2f}" for class_id, confidence in + zip(detections.class_id, detections.confidence) + ] + + image = BOUNDING_BOX_ANNOTATOR.annotate(image, detections) + image = LABEL_ANNOTATOR.annotate(image, detections, labels=labels) + return image + + +def inference_per_sample(interp, + image_path, + texts, + priors, + strides, + output_dir, + size=(640, 640), + vis=False, + score_thr=0.05, + nms_thr=0.3, + max_dets=300): + + # input / output details from TFLite + input_details = interp.get_input_details() + output_details = interp.get_output_details() + + # load image from path + ori_image = cv2.imread(image_path) + h, w = ori_image.shape[:2] + image, scale_factor, pad_param = preprocess(ori_image[:, :, [2, 1, 0]], + size) + + # inference + interp.set_tensor(input_details[0]['index'], image) + interp.invoke() + + scores = interp.get_tensor(output_details[1]['index']) + bboxes = interp.get_tensor(output_details[0]['index']) + + # can be converted to numpy for other devices + # using torch here is only for references. 
+ ori_scores = torch.from_numpy(scores[0]) + ori_bboxes = torch.from_numpy(bboxes) + + # decode bbox cordinates with priors + decoded_bboxes = simple_bbox_decode(priors, ori_bboxes, strides)[0] + scores_list = [] + labels_list = [] + bboxes_list = [] + for cls_id in range(len(texts)): + cls_scores = ori_scores[:, cls_id] + labels = torch.ones(cls_scores.shape[0], dtype=torch.long) * cls_id + keep_idxs = nms(decoded_bboxes, cls_scores, iou_threshold=0.5) + cur_bboxes = decoded_bboxes[keep_idxs] + cls_scores = cls_scores[keep_idxs] + labels = labels[keep_idxs] + scores_list.append(cls_scores) + labels_list.append(labels) + bboxes_list.append(cur_bboxes) + + scores = torch.cat(scores_list, dim=0) + labels = torch.cat(labels_list, dim=0) + bboxes = torch.cat(bboxes_list, dim=0) + + keep_idxs = scores > score_thr + scores = scores[keep_idxs] + labels = labels[keep_idxs] + bboxes = bboxes[keep_idxs] + # only for visualization, add an extra NMS + keep_idxs = nms(bboxes, scores, iou_threshold=nms_thr) + num_dets = min(len(keep_idxs), max_dets) + bboxes = bboxes[keep_idxs].unsqueeze(0) + scores = scores[keep_idxs].unsqueeze(0) + labels = labels[keep_idxs].unsqueeze(0) + + scores = scores[0, :num_dets].numpy() + bboxes = bboxes[0, :num_dets].numpy() + labels = labels[0, :num_dets].numpy() + + bboxes -= np.array( + [pad_param[1], pad_param[0], pad_param[1], pad_param[0]]) + bboxes /= scale_factor + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, w) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, h) + + if vis: + image_out = visualize(ori_image, bboxes, labels, scores, texts) + cv2.imwrite(osp.join(output_dir, osp.basename(image_path)), image_out) + print(f"detecting {num_dets} objects.") + return image_out, ori_scores, ori_bboxes[0] + else: + return bboxes, labels, scores + + +def main(): + + args = parse_args() + tflite_file = args.tflite + # init ONNX session + interpreter = tf.lite.Interpreter(model_path=tflite_file, + experimental_preserve_all_tensors=True) + 
interpreter.allocate_tensors() + print("Init TFLite Interpter") + output_dir = "onnx_outputs" + if not osp.exists(output_dir): + os.mkdir(output_dir) + + # load images + if not osp.isfile(args.image): + images = [ + osp.join(args.image, img) for img in os.listdir(args.image) + if img.endswith('.png') or img.endswith('.jpg') + ] + else: + images = [args.image] + + if args.text.endswith('.txt'): + with open(args.text) as f: + lines = f.readlines() + texts = [[t.rstrip('\r\n')] for t in lines] + elif args.text.endswith('.json'): + texts = json.load(open(args.text)) + else: + texts = [[t.strip()] for t in args.text.split(',')] + + size = (640, 640) + strides = [8, 16, 32] + + # prepare anchors, since TFLite models does not contain anchors, due to INT8 quantization. + featmap_sizes = [(size[0] // s, size[1] // s) for s in strides] + flatten_priors = generate_anchors(featmap_sizes, strides=strides) + mlvl_strides = [ + flatten_priors.new_full((featmap_size[0] * featmap_size[1] * 1, ), + stride) + for featmap_size, stride in zip(featmap_sizes, strides) + ] + flatten_strides = torch.cat(mlvl_strides) + + print("Start to inference.") + for img in tqdm.tqdm(images): + inference_per_sample(interpreter, + img, + texts, + flatten_priors[None], + flatten_strides, + output_dir=output_dir, + vis=True, + score_thr=0.3, + nms_thr=0.5) + print("Finish inference") + + +if __name__ == "__main__": + main() diff --git a/models/YOLO-World/docs/data.md b/models/YOLO-World/docs/data.md new file mode 100644 index 0000000000000000000000000000000000000000..9e792f63da4f4cf73cd0c28ea3ccdd0b0b309e1d --- /dev/null +++ b/models/YOLO-World/docs/data.md @@ -0,0 +1,124 @@ +## Preparing Data for YOLO-World + +### Overview + +For pre-training YOLO-World, we adopt several datasets as listed in the below table: + +| Data | Samples | Type | Boxes | +| :-- | :-----: | :---:| :---: | +| Objects365v1 | 609k | detection | 9,621k | +| GQA | 621k | grounding | 3,681k | +| Flickr | 149k | grounding | 641k | +| 
CC3M-Lite | 245k | image-text | 821k | + +### Dataset Directory + +We put all data into the `data` directory, such as: + +```bash +├── coco +│ ├── annotations +│ ├── lvis +│ ├── train2017 +│ ├── val2017 +├── flickr +│ ├── annotations +│ └── images +├── mixed_grounding +│ ├── annotations +│ ├── images +├── mixed_grounding +│ ├── annotations +│ ├── images +├── objects365v1 +│ ├── annotations +│ ├── train +│ ├── val +``` +**NOTE**: We strongly suggest that you check the directories or paths in the dataset part of the config file, especially for the values `ann_file`, `data_root`, and `data_prefix`. + +We provide the annotations of the pre-training data in the below table: + +| Data | images | Annotation File | +| :--- | :------| :-------------- | +| Objects365v1 | [`Objects365 train`](https://opendatalab.com/OpenDataLab/Objects365_v1) | [`objects365_train.json`](https://opendatalab.com/OpenDataLab/Objects365_v1) | +| MixedGrounding | [`GQA`](https://nlp.stanford.edu/data/gqa/images.zip) | [`final_mixed_train_no_coco.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_mixed_train_no_coco.json) | +| Flickr30k | [`Flickr30k`](https://shannon.cs.illinois.edu/DenotationGraph/) |[`final_flickr_separateGT_train.json`](https://huggingface.co/GLIPModel/GLIP/tree/main/mdetr_annotations/final_flickr_separateGT_train.json) | +| LVIS-minival | [`COCO val2017`](https://cocodataset.org/) | [`lvis_v1_minival_inserted_image_name.json`](https://huggingface.co/GLIPModel/GLIP/blob/main/lvis_v1_minival_inserted_image_name.json) | + +**Acknowledgement:** We sincerely thank [GLIP](https://github.com/microsoft/GLIP) and [mdetr](https://github.com/ashkamath/mdetr) for providing the annotation files for pre-training. + + +### Dataset Class + +> For fine-tuning YOLO-World on Close-set Object Detection, using `MultiModalDataset` is recommended. 
+ +#### Setting CLASSES/Categories + +If you use `COCO-format` custom datasets, you "DO NOT" need to define a dataset class for custom vocabularies/categories. +Explicitly setting the CLASSES in the config file through `metainfo=dict(classes=your_classes),` is simple: + +```python + +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + metainfo=dict(classes=your_classes), + data_root='data/your_data', + ann_file='annotations/your_annotation.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/your_class_texts.json', + pipeline=train_pipeline) +``` + + +For training YOLO-World, we mainly adopt two kinds of dataset classes: + +#### 1. `MultiModalDataset` + +`MultiModalDataset` is a simple wrapper for pre-defined Dataset Class, such as `Objects365` or `COCO`, which adds the texts (category texts) into the dataset instance for formatting input texts. + +**Text JSON** + +The json file is formatted as follows: + +```json +[ + ["A_1","A_2"], + ["B"], + ["C_1", "C_2", "C_3"], + ... +] +``` + +We have provided the text json for [`LVIS`](./../data/texts/lvis_v1_class_texts.json), [`COCO`](../data/texts/coco_class_texts.json), and [`Objects365`](../data/texts/obj365v1_class_texts.json) + +#### 2. `YOLOv5MixedGroundingDataset` + +The `YOLOv5MixedGroundingDataset` extends the `COCO` dataset by supporting loading texts/captions from the json file. It's designed for `MixedGrounding` or `Flickr30K` with text tokens for each object. + + + +### 🔥 Custom Datasets + +For custom dataset, we suggest the users convert the annotation files according to the usage. Note that, converting the annotations to the **standard COCO format** is basically required. + +1. **Large vocabulary, grounding, referring:** you can follow the annotation format as the `MixedGrounding` dataset, which adds `caption` and `tokens_positive` for assigning the text for each object. 
The texts can be a category or a noun phrase. + +2. **Custom vocabulary (fixed):** you can adopt the `MultiModalDataset` wrapper as the `Objects365` and create a **text json** for your custom categories. + + +### CC3M Pseudo Annotations + +The following annotations are generated according to the automatic labeling process in our paper. We report the results based on these annotations. + +To use CC3M annotations, you need to prepare the `CC3M` images first. + +| Data | Images | Boxes | File | +| :--: | :----: | :---: | :---: | +| CC3M-246K | 246,363 | 820,629 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_annotations.json) | +| CC3M-500K | 536,405 | 1,784,405| [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_500k_annotations.json) | +| CC3M-750K | 750,000 | 4,504,805 | [Download 🤗](https://huggingface.co/wondervictor/YOLO-World/blob/main/cc3m_pseudo_750k_annotations.json) | \ No newline at end of file diff --git a/models/YOLO-World/docs/deploy.md b/models/YOLO-World/docs/deploy.md new file mode 100644 index 0000000000000000000000000000000000000000..b55d64c96b164a23fdf9ecad3e3dcdb288b253a0 --- /dev/null +++ b/models/YOLO-World/docs/deploy.md @@ -0,0 +1,59 @@ +## Deploy YOLO-World + +- [x] ONNX export +- [x] ONNX demo +- [ ] TensorRT +- [ ] TFLite + +We provide several ways to deploy YOLO-World with ONNX or TensorRT. + +### Preliminaries + +```bash +pip install supervision onnx onnxruntime onnxsim +``` + +### Export ONNX on Gradio Demo + +Start the `demo.py` and you can modify the texts in the demo and output the ONNX model. + +```bash +python demo.py path/to/config path/to/weights +``` + +### Export YOLO-World to ONNX models + +You can also use [`export_onnx.py`](../deploy/export_onnx.py) to obtain the ONNX model. You might specify the `--custom-text` with your own `Text JSON` for your custom prompts. The format of `Text JSON` can be found in [`docs/data`](../docs/data.md). 
+ +```bash +PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 +``` + +If you don't want to include `NMS` or "post-processing" into the ONNX model, you can add `--without-nms` +```bash +PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-nms +``` + +If you want to quantize YOLO-World with an ONNX model, you'd better remove `NMS` and `bbox_decoder` by adding `--without-bbox-decoder` + +```bash +PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder +``` + +**Running ONNX demo** + +```bash +python deploy/onnx_demo.py path/to/model.onnx path/to/images path/to/texts +``` + + +### Export YOLO-World to TensorRT models + +coming soon. + +### FAQ + +**Q1**. `RuntimeError: Exporting the operator einsum to ONNX opset version 11 is not supported. Support for this operator was added in version 12, try exporting with this version.` + +**A:** This error arises because YOLO-World adopts `einsum` for matrix multiplication while it is not supported by `opset 11`. You can change the `--opset` from `11` to `12` if your device supports it, or change the `einsum` to normal `permute/reshape/multiplication` by setting `use_einsum=False` in the `MaxSigmoidCSPLayerWithTwoConv` and `YOLOWorldHeadModule`. You can refer to the [sample config](../configs/pretrain/yolo_world_v2_m_vlpan_bn_noeinsum_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) without einsum. + diff --git a/models/YOLO-World/docs/faq.md b/models/YOLO-World/docs/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..48817d21558e2b481577b4dea4a74876b3e14e59 --- /dev/null +++ b/models/YOLO-World/docs/faq.md @@ -0,0 +1,9 @@ +## Frequently Asked Questions (FAQ) + + +1. 
` Incorrect path_or_model_id` +```bash +OSError: class `YOLOWorldDetector` in yolo_world/models/detectors/yolo_world.py: class `MultiModalYOLOBackbone` in yolo_world/models/backbones/mm_backbone.py: class `HuggingCLIPLanguageBackbone` in yolo_world/models/backbones/mm_backbone.py: Incorrect path_or_model_id: '../pretrained_models/clip-vit-base-patch32-projection'. Please provide either the path to a local folder or the repo_id of a model on the Hub. +``` + +**Solution:** \ No newline at end of file diff --git a/models/YOLO-World/docs/finetuning.md b/models/YOLO-World/docs/finetuning.md new file mode 100644 index 0000000000000000000000000000000000000000..d128369f0c5b50f81917becd6c696f46797f452c --- /dev/null +++ b/models/YOLO-World/docs/finetuning.md @@ -0,0 +1,105 @@ +## Fine-tuning YOLO-World + +Fine-tuning YOLO-World is easy and we provide the samples for COCO object detection as a simple guidance. + + +### Fine-tuning Requirements + +Fine-tuning YOLO-World is cheap: + +* it does not require 32 GPUs for multi-node distributed training. **8 GPUs or even 1 GPU** is enough. + +* it does not require the long schedule, *e.g.,* 300 epochs or 500 epochs for training YOLOv5 or YOLOv8. **80 epochs or fewer** is enough considering that we provide the good pre-trained weights. + +### Data Preparation + +The fine-tuning dataset should have a format similar to that of the pre-training dataset. +We suggest you refer to [`docs/data`](./data.md) for more details about how to build the datasets: + +* if you fine-tune YOLO-World for close-set / custom vocabulary object detection, using `MultiModalDataset` with a `text json` is preferred. + +* if you fine-tune YOLO-World for open-vocabulary detection with rich texts or grounding tasks, using `MixedGroundingDataset` is preferred. 
+ +### Hyper-parameters and Config + +Please refer to the [config for fine-tuning YOLO-World-L on COCO](../configs/finetune_coco/yolo_world_l_dual_vlpan_2e-4_80e_8gpus_finetune_coco.py) for more details. + +1. Basic config file: + +If the fine-tuning dataset **contains mask annotations**: + +```python +_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +``` + +If the fine-tuning dataset **doesn't contain mask annotations**: + +```python +_base_ = ('../../third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py') +``` + +2. Training Schemes: + +Reducing the epochs and adjusting the learning rate + +```python +max_epochs = 80 +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +close_mosaic_epochs=10 + +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) + +``` + +3. Datasets: + +```python +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/coco_class_texts.json', + pipeline=train_pipeline) +``` + +#### Finetuning without RepVL-PAN or Text Encoder 🚀 + +For further efficiency and simplicity, we can fine-tune an efficient version of YOLO-World without RepVL-PAN and the text encoder. +The efficient version of YOLO-World has a similar architecture or layers to the original YOLOv8, but we provide the pre-trained weights on large-scale datasets. +The pre-trained YOLO-World has strong generalization capabilities and is more robust compared to YOLOv8 trained on the COCO dataset. 
+ +You can refer to the [config for Efficient YOLO-World](./../configs/finetune_coco/yolo_world_l_efficient_neck_2e-4_80e_8gpus_finetune_coco.py) for more details. + +The efficient YOLO-World adopts `EfficientCSPLayerWithTwoConv` and the text encoder can be removed during inference or exporting models. + +```python + +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='EfficientCSPLayerWithTwoConv'))) + +``` + +### Launch Fine-tuning! + +It's easy: + +```bash +./dist_train.sh --amp +``` diff --git a/models/YOLO-World/docs/installation.md b/models/YOLO-World/docs/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..52befc1f7801fce2d3b506cbf9f0067761caff53 --- /dev/null +++ b/models/YOLO-World/docs/installation.md @@ -0,0 +1,41 @@ +## Installation Guide + +We provide the `requirements` files in [./requirements](./../requirements/): + +* `basic_requirements`: training, finetuning, evaluation. +* `demo_requirements`: running YOLO-World [demos](./../demo/). +* `onnx_requirements`: converting YOLO-World to ONNX or TFLite models (TFLite is coming soon). + +#### Install `MMCV` + +YOLO-World adopts `mmcv>=2.0.0`. There are several ways to install `mmcv` + +**1. using `openmim`**: + +see more in [official guide](https://github.com/open-mmlab/mmcv/tree/master?tab=readme-ov-file#install-mmcv-full). + +```bash +pip install openmim +mim install mmcv==2.0.0 +``` + +**2. using `pip`**: + +go to [install-with-pip](https://mmcv.readthedocs.io/en/latest/get_started/installation.html#install-with-pip) to select the pip index. 
+ +```bash +# cuda=11.3, torch=1.11 +pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html +# cuda=11.7, torch=1.13 +pip install mmcv==2.2.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html +# cuda=12.1, torch=2.1 +pip install mmcv==2.1.0 -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html +``` + +**3. using `whl`** + +go to [index packages](https://download.openmmlab.com/mmcv/dist/cu117/torch1.13/index.html) to find a suitable version and download. + +```bash +pip install mmcv-2.0.1-cp38-cp38-manylinux1_x86_64.whl +``` \ No newline at end of file diff --git a/models/YOLO-World/docs/prompt_yolo_world.md b/models/YOLO-World/docs/prompt_yolo_world.md new file mode 100644 index 0000000000000000000000000000000000000000..c7ba81c99a8226f6b31c256a036967c39252d982 --- /dev/null +++ b/models/YOLO-World/docs/prompt_yolo_world.md @@ -0,0 +1,73 @@ +## Prompt YOLO-World + + +### 1. Simple YOLO-World with Embeddings + +To simplify YOLO-World and get rid of the language model, we define a new basic detector `YOLOWorldPromptDetector`: + +The `YOLOWorldPromptDetector` supports prompt embeddings as the input and doesn't contain a language model anymore! +Now, YOLO-World adopts `embeddings` as language inputs, and the embeddings support several kinds: (1) text embeddings from the language model, e.g., CLIP language encoder, (2) image embeddings from a vision model, e.g., CLIP vision encoder, (3) image-text fused embeddings, and (4) random embeddings. +Kinds (1)(2)(3) support zero-shot inference, while (4), as well as (1)(2)(3), is designed for prompt tuning on your custom data. 
+ +The basic detector is defined as follows: + +```python +class YOLOWorldPromptDetector(YOLODetector): + """Implementation of YOLO World Series""" + + def __init__(self, + *args, + mm_neck: bool = False, + num_train_classes=80, + num_test_classes=80, + prompt_dim=512, + num_prompts=80, + embedding_path='', + freeze_prompt=False, + use_mlp_adapter=False, + **kwargs) +``` + +To use it in a zero-shot manner, you need to pre-compute the text embeddings (image embeddings) and save them as a `numpy array (*.npy)` with a `NxD` shape (N is the number of prompts and D is the dimension of the embeddings). Currently, we only support one prompt for one class. You can use several prompts for one class but you need to merge the results in the post-processing steps. + + +### 2. Prompt Tuning YOLO-World + +We introduce prompt tuning for YOLO-World to maintain the zero-shot ability while improving the performance on your custom datasets. + +For more details about writing configs for prompt tuning, you can refer to [`prompt tuning for COCO data`](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py). + +1. Use random prompts + +```python +dict(type='YOLOWorldPromptDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + prompt_dim=text_channels, + num_prompts=80, + ...) +``` + +2. Use CLIP embeddings (text, image, or text-image embeddings) + +The `clip_vit_b32_coco_80_embeddings.npy` can be downloaded at [HuggingFace](https://huggingface.co/wondervictor/YOLO-World/blob/main/clip_vit_b32_coco_80_embeddings.npy). + +```python +dict(type='YOLOWorldPromptDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + embedding_path='embeddings/clip_vit_b32_coco_80_embeddings.npy', + prompt_dim=text_channels, + num_prompts=80, + ...) +``` + +Using the CLIP model to obtain the image and text embeddings will maintain the zero-shot performance. 
+ + +| Model | Config | AP | AP50 | AP75 | APS | APM | APL | +| :---- | :----: | :--: | :--: | :---: | :-: | :-: | :-: | +| YOLO-World-v2-L | Zero-shot | 45.7 | 61.6 | 49.8 | 29.9 | 50.0 | 60.8 | +| [YOLO-World-v2-L](./../configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py) | Prompt tuning | 47.9 | 64.3 | 52.5 | 31.9 | 52.6 | 61.3 | diff --git a/models/YOLO-World/docs/reparameterize.md b/models/YOLO-World/docs/reparameterize.md new file mode 100644 index 0000000000000000000000000000000000000000..9115783d086812c94bf97320933d7ddae1c3847e --- /dev/null +++ b/models/YOLO-World/docs/reparameterize.md @@ -0,0 +1,77 @@ +## Reparameterize YOLO-World + +The reparameterization incorporates text embeddings as parameters into the model. For example, in the final classification layer, text embeddings are reparameterized into a simple 1x1 convolutional layer. + +
+ +
+ +### Key Advantages from Reparameterization + +> Reparameterized YOLO-World still has zero-shot ability! + +* **Efficiency:** reparameterized YOLO-World has a simple and efficient architecture, e.g., `conv1x1` is faster than `transpose & matmul`. In addition, it enables further optimization for deployment. + +* **Accuracy:** reparameterized YOLO-World supports fine-tuning. Compared to the normal `fine-tuning` or `prompt tuning`, **reparameterized version can optimize the `neck` and `head` independently** since the `neck` and `head` have different parameters and do not depend on `text embeddings` anymore! +For example, fine-tuning the **reparameterized YOLO-World** obtains *46.3 AP* on COCO *val2017* while fine-tuning the normal version obtains *46.1 AP*, with all hyper-parameters kept the same. + +### Getting Started + +#### 1. Prepare custom text embeddings + +You need to generate the text embeddings by [`tools/generate_text_prompts.py`](../tools/generate_text_prompts.py) and save it as a `numpy.array` with shape `NxD`. + +#### 2. Reparameterizing + +Reparameterizing will generate a new checkpoint with text embeddings! + +Check those files first: + +* model checkpoint +* text embeddings + +We mainly reparameterize two groups of modules: + +* head (`YOLOWorldHeadModule`) +* neck (`MaxSigmoidCSPLayerWithTwoConv`) + +```bash +python tools/reparameterize_yoloworld.py \ + --model path/to/checkpoint \ + --out-dir path/to/save/re-parameterized/ \ + --text-embed path/to/text/embeddings \ + --conv-neck +``` + + +#### 3. Prepare the model config + +Please see the sample config: [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for reparameterized training. 
+ + +* `RepConvMaxSigmoidCSPLayerWithTwoConv`: + +```python +neck=dict(type='YOLOWorldPAFPN', + guide_channels=num_classes, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv', + guide_channels=num_classes)), +``` + +* `RepYOLOWorldHeadModule`: + +```python +bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule', + embed_dims=text_channels, + num_guide=num_classes, + num_classes=num_classes)), + +``` + +#### 4. Reparameterized Training + +**Reparameterized YOLO-World** is easier to fine-tune and can be treated as an enhanced and pre-trained YOLOv8! + +You can check [`finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py`](../configs/finetune_coco/yolo_world_v2_s_rep_vlpan_bn_2e-4_80e_8gpus_mask-refine_finetune_coco.py) for more details. \ No newline at end of file diff --git a/models/YOLO-World/docs/tflite_deploy.md b/models/YOLO-World/docs/tflite_deploy.md new file mode 100644 index 0000000000000000000000000000000000000000..4cdadffbadb54dee49e46b77a4bc118e1fb16631 --- /dev/null +++ b/models/YOLO-World/docs/tflite_deploy.md @@ -0,0 +1,78 @@ +## Run YOLO-World (Quantized) on TF-Lite + +- [x] Export YOLO-World to TFLite with INT8 Quantization. +- [x] TFLite demo + +### Preliminaries + +```bash +pip install onnxruntime onnx onnx-simplifier +pip install tensorflow==2.15.1 +``` + +See [onnx2tf](https://github.com/PINTO0309/onnx2tf) for more details about exporting TFLite models. +The contributor of `onnx2tf` is very nice! + +### Export TFLite INT8 Quantization models + +Please use **Reparameterized YOLO-World** for TFLite!! + +1. Prepare the ONNX model + +Please export the ONNX model without `postprocessing` and `bbox_decoder`, just add `--without-bbox-decoder`! +`bbox_decoder` is not supported for INT8 quantization, please take care! 
+ +```bash +PYTHONPATH=./ python deploy/export_onnx.py path/to/config path/to/weights --custom-text path/to/customtexts --opset 11 --without-bbox-decoder +``` + +2. Generate the calibration samples + +Using 100 COCO images is suggested to create a simple calibration dataset for quantization. + +```python +import os +import random +from PIL import Image, ImageOps +import cv2 +import glob +import numpy as np + +root = "data/coco/val2017/" +image_list = os.listdir(root) +image_list = [os.path.join(root, f) for f in image_list] +random.shuffle(image_list) + +img_datas = [] +for idx, file in enumerate(image_list[:100]): + image = Image.open(file).convert('RGB') + # Get sample input data as a numpy array in a method of your choosing. + img_width, img_height = image.size + size = max(img_width, img_height) + image = ImageOps.pad(image, (size, size), method=Image.BILINEAR) + image = image.resize((640, 640), Image.BILINEAR) + tensor_image = np.asarray(image).astype(np.float32) + tensor_image /= 255.0 + tensor_image = np.expand_dims(tensor_image, axis=0) + img_datas.append(tensor_image) + +calib_datas = np.vstack(img_datas) +print(f'calib_datas.shape: {calib_datas.shape}') +np.save(file='tflite_calibration_data_100_images_640.npy', arr=calib_datas) + +``` + +3. 
Export ONNX to TFLite using `onnx2tf` + +```bash +onnx2tf -i [ONNX] -o [OUTPUT] -oiqt -cind "images" "tflite_calibration_data_100_images_640.npy" "[[[[0.,0.,0.]]]]" "[[[[1.,1.,1.]]]]" -onimc "scores" "bboxes" --verbosity debug +``` + +We provide a sample TFLite INT8 model: [yolo_world_x_coco_zeroshot_rep_integer_quant.tflite](https://huggingface.co/wondervictor/YOLO-World/blob/main/yolo_x_coco_zeroshot_rep_integer_quant.tflite) + +### Inference using TFLite + +```bash +python deploy/tflite_demo.py path/to/tflite path/to/images path/to/texts + +``` \ No newline at end of file diff --git a/models/YOLO-World/docs/updates.md b/models/YOLO-World/docs/updates.md new file mode 100644 index 0000000000000000000000000000000000000000..4605c767af577df857e9cf09cf8098a1ffb4ba7c --- /dev/null +++ b/models/YOLO-World/docs/updates.md @@ -0,0 +1,14 @@ +## Update Notes + +We provide the details for important updates of YOLO-World in this note. + +### Model Architecture + +**[2024-2-29]:** YOLO-World-v2: + +1. We remove the `I-PoolingAttention`: though it improves the performance for zero-shot LVIS evaluation, it affects the inference speeds after exporting YOLO-World to ONNX or TensorRT. Considering the trade-off, we remove the `I-PoolingAttention` in the newest version. +2. We replace the `L2-Norm` in the contrastive head with the `BatchNorm`. The `L2-Norm` contains complex operations, such as `reduce`, which is time-consuming for deployment. However, the `BatchNorm` can be fused into the convolution, which is much more efficient and also improves the zero-shot performance. 
+ + + + diff --git a/models/YOLO-World/pyproject.toml b/models/YOLO-World/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..f56a1c239fc37ab97efa2cdc6f59fa5630ce7825 --- /dev/null +++ b/models/YOLO-World/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools","wheel","torch"] +build-backend = "setuptools.build_meta" + +[project] +name = "yolo_world" +version = "0.1.0" +description = "YOLO-World: Real-time Open Vocabulary Object Detection" +readme = "README.md" +keywords = ["object detection"] +authors = [ + { name = "Tencent AILab", email = "ronnysong@tencent.com" }, +] +license = {text = "Apache License 2.0"} + +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +requires-python = ">= 3.7" + +dependencies = [ + "wheel", + "torch>=2.1.0", + "torchvision>=0.16.2", + "transformers", + "tokenizers", + "numpy", + "opencv-python", + "supervision==0.19.0", + "openmim", + "mmcv-lite>=2.0.0rc4", + "mmdet>=3.0.0", + "mmengine>=0.7.1", + "mmcv", + 'mmyolo @ git+https://github.com/onuralpszr/mmyolo.git', + +] + +[tool.setuptools] +package-dir = {"yolo_world" = "yolo_world"} +include-package-data = false +license-files = ["LICENSE"] +zip-safe = true + +[tool.setuptools.packages.find] +include = ["yolo_world*"] +exclude = ["docs*", "tests*","third_party*","assets*"] \ No newline at end of file diff --git a/models/YOLO-World/requirements/basic_requirements.txt b/models/YOLO-World/requirements/basic_requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..d9c56e20f6a955bd905ee0c02d4a71447a7f086b --- /dev/null +++ b/models/YOLO-World/requirements/basic_requirements.txt @@ -0,0 +1,9 @@ +opencv-python==4.9.0.80 +opencv-python-headless==4.2.0.34 +mmcv==2.0.0 +mmdet==3.0.0 +mmengine==0.10.3 +mmyolo==0.6.0 +timm==0.6.13 +transformers==4.36.2 +albumentations \ No newline at end of file diff --git a/models/YOLO-World/requirements/demo_requirements.txt b/models/YOLO-World/requirements/demo_requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0268ad3ce823bd680e831a8c0b98a70edc2a1c20 --- /dev/null +++ b/models/YOLO-World/requirements/demo_requirements.txt @@ -0,0 +1,2 @@ +gradio==4.16.0 +supervision \ No newline at end of file diff --git a/models/YOLO-World/requirements/onnx_requirements.txt b/models/YOLO-World/requirements/onnx_requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a96fb18e6389de5eafcd0c73289049c51f649550 --- /dev/null +++ b/models/YOLO-World/requirements/onnx_requirements.txt @@ -0,0 +1,4 @@ +supervision +onnx +onnxruntime +onnxsim \ No newline at end of file diff --git a/models/YOLO-World/third_party/mmyolo/.circleci/config.yml b/models/YOLO-World/third_party/mmyolo/.circleci/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..59ba321aeec5dd3904c8df29e2833a41dbc676f7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.circleci/config.yml @@ -0,0 +1,34 @@ +version: 2.1 + +# this allows you to use CircleCI's dynamic configuration feature +setup: true + +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 + +workflows: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: + jobs: + # the path-filtering/filter job determines which pipeline + # parameters to update. 
+ - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # + mapping: | + mmyolo/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + tools/.* lint_only false + configs/.* lint_only false + .circleci/.* lint_only false + base-revision: main + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. + config-path: .circleci/test.yml diff --git a/models/YOLO-World/third_party/mmyolo/.circleci/docker/Dockerfile b/models/YOLO-World/third_party/mmyolo/.circleci/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d9cf8cc7712d5241975c3b748fb0d01a5545b4fd --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/models/YOLO-World/third_party/mmyolo/.circleci/test.yml b/models/YOLO-World/third_party/mmyolo/.circleci/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..149d6cac15ff9643a21535638a6cd5f961a17d4a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.circleci/test.yml @@ -0,0 +1,213 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + 
+jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 90 mmyolo + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install ONNXRuntime + command: | + pip install onnxruntime==1.8.1 + wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz + tar xvf onnxruntime-linux-x64-1.8.1.tgz + - run: + name: Install mmyolo dependencies + command: | + pip install -U openmim + mim install git+https://github.com/open-mmlab/mmengine.git@main + mim install 'mmcv >= 2.0.0' + mim install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x + pip install -r requirements/albu.txt + pip install -r requirements/tests.txt + - run: + name: Install mmdeploy + command: | + pip install setuptools + git clone -b dev-1.x --depth 1 
https://github.com/open-mmlab/mmdeploy.git mmdeploy --recurse-submodules + wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.tar.gz + tar -xzvf cmake-3.20.0-linux-x86_64.tar.gz + sudo ln -sf $(pwd)/cmake-3.20.0-linux-x86_64/bin/* /usr/bin/ + cd mmdeploy && mkdir build && cd build && cmake .. -DMMDEPLOY_TARGET_BACKENDS=ort -DONNXRUNTIME_DIR=/home/circleci/project/onnxruntime-linux-x64-1.8.1 && make -j8 && make install + export LD_LIBRARY_PATH=/home/circleci/project/onnxruntime-linux-x64-1.8.1/lib:${LD_LIBRARY_PATH} + cd /home/circleci/project/mmdeploy && python -m pip install -v -e . + - run: + name: Build and install + command: | + pip install -e . + - run: + name: Run unittests + command: | + export LD_LIBRARY_PATH=/home/circleci/project/onnxruntime-linux-x64-1.8.1/lib:${LD_LIBRARY_PATH} + pytest tests/ +# coverage run --branch --source mmyolo -m pytest tests/ +# coverage xml +# coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["10.1", "10.2", "11.0", "11.7"] + cudnn: + type: integer + default: 7 + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + # docker_layer_caching: true + resource_class: gpu.nvidia.small + steps: + - checkout + - run: + # Cloning repos in VM since Docker doesn't have access to the private key + name: Clone Repos + command: | + git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine + git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmyolo:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmyolo -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmdetection:/mmdetection -w /mmyolo --name mmyolo mmyolo:gpu + - run: + name: Install mmyolo 
dependencies + command: | + docker exec mmyolo pip install -U openmim + docker exec mmyolo mim install -e /mmengine + docker exec mmyolo mim install 'mmcv >= 2.0.0' + docker exec mmyolo pip install -e /mmdetection + docker exec mmyolo pip install -r requirements/albu.txt + docker exec mmyolo pip install -r requirements/tests.txt + - run: + name: Build and install + command: | + docker exec mmyolo pip install -e . + - run: + name: Run unittests + command: | + docker exec mmyolo pytest tests/ + +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + + pr_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + - build_cpu: + name: minimum_version_cpu + torch: 1.8.0 + torchvision: 0.9.0 + python: 3.8.0 # The lowest python 3.7.x version available on CircleCI images + requires: + - lint + - build_cpu: + name: maximum_version_cpu + # mmdeploy not supported +# torch: 2.0.0 +# torchvision: 0.15.1 + torch: 1.12.1 + torchvision: 0.13.1 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "10.2" + requires: + - hold + - build_cuda: + name: maximum_version_gpu + torch: 2.0.0 + cuda: "11.7" + cudnn: 8 + requires: + - hold + merge_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.7.0 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.0" + cudnn: 8 + filters: + branches: + only: + - main diff --git a/models/YOLO-World/third_party/mmyolo/.dev_scripts/gather_models.py b/models/YOLO-World/third_party/mmyolo/.dev_scripts/gather_models.py new file mode 100644 index 
0000000000000000000000000000000000000000..f05e2b5b31329e12f1bd62196de6592fade0a7c8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.dev_scripts/gather_models.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import shutil +import subprocess +import time +from collections import OrderedDict + +import torch +import yaml +from mmengine.config import Config +from mmengine.fileio import dump +from mmengine.utils import mkdir_or_exist, scandir + + +def ordered_yaml_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds): + + class OrderedDumper(Dumper): + pass + + def _dict_representer(dumper, data): + return dumper.represent_mapping( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, data.items()) + + OrderedDumper.add_representer(OrderedDict, _dict_representer) + return yaml.dump(data, stream, OrderedDumper, **kwds) + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + if 'message_hub' in checkpoint: + del checkpoint['message_hub'] + if 'ema_state_dict' in checkpoint: + del checkpoint['ema_state_dict'] + + for key in list(checkpoint['state_dict']): + if key.startswith('data_preprocessor'): + checkpoint['state_dict'].pop(key) + elif 'priors_base_sizes' in key: + checkpoint['state_dict'].pop(key) + elif 'grid_offset' in key: + checkpoint['state_dict'].pop(key) + elif 'prior_inds' in key: + checkpoint['state_dict'].pop(key) + + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. 
+ if torch.__version__ >= '1.6': + torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + f'-{sha[:8]}.pth' + subprocess.Popen(['mv', out_file, final_file]) + return final_file + + +def is_by_epoch(config): + cfg = Config.fromfile('./configs/' + config) + return cfg.train_cfg.type == 'EpochBasedTrainLoop' + + +def get_final_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + return cfg.train_cfg.max_epochs + else: + return cfg.train_cfg.max_iters + + +def get_best_epoch_or_iter(exp_dir): + best_epoch_iter_full_path = list( + sorted(glob.glob(osp.join(exp_dir, 'best_*.pth'))))[-1] + best_epoch_or_iter_model_path = best_epoch_iter_full_path.split('/')[-1] + best_epoch_or_iter = best_epoch_or_iter_model_path. \ + split('_')[-1].split('.')[0] + return best_epoch_or_iter_model_path, int(best_epoch_or_iter) + + +def get_real_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + epoch = cfg.train_cfg.max_epochs + return epoch + else: + return cfg.runner.max_iters + + +def get_final_results(log_json_path, + epoch_or_iter, + results_lut='coco/bbox_mAP', + by_epoch=True): + result_dict = dict() + with open(log_json_path) as f: + r = f.readlines()[-1] + last_metric = r.split(',')[0].split(': ')[-1].strip() + result_dict[results_lut] = last_metric + return result_dict + + +def get_dataset_name(config): + # If there are more dataset, add here. 
+ name_map = dict( + CityscapesDataset='Cityscapes', + CocoDataset='COCO', + PoseCocoDataset='COCO Person', + YOLOv5CocoDataset='COCO', + CocoPanopticDataset='COCO', + YOLOv5DOTADataset='DOTA 1.0', + DeepFashionDataset='Deep Fashion', + LVISV05Dataset='LVIS v0.5', + LVISV1Dataset='LVIS v1', + VOCDataset='Pascal VOC', + YOLOv5VOCDataset='Pascal VOC', + WIDERFaceDataset='WIDER Face', + OpenImagesDataset='OpenImagesDataset', + OpenImagesChallengeDataset='OpenImagesChallengeDataset') + cfg = Config.fromfile('./configs/' + config) + return name_map[cfg.dataset_type] + + +def find_last_dir(model_dir): + dst_times = [] + for time_stamp in os.scandir(model_dir): + if osp.isdir(time_stamp): + dst_time = time.mktime( + time.strptime(time_stamp.name, '%Y%m%d_%H%M%S')) + dst_times.append([dst_time, time_stamp.name]) + return max(dst_times, key=lambda x: x[0])[1] + + +def convert_model_info_to_pwc(model_infos): + pwc_files = {} + for model in model_infos: + cfg_folder_name = osp.split(model['config'])[-2] + pwc_model_info = OrderedDict() + pwc_model_info['Name'] = osp.split(model['config'])[-1].split('.')[0] + pwc_model_info['In Collection'] = 'Please fill in Collection name' + pwc_model_info['Config'] = osp.join('configs', model['config']) + + # get metadata + meta_data = OrderedDict() + if 'epochs' in model: + meta_data['Epochs'] = get_real_epoch_or_iter(model['config']) + else: + meta_data['Iterations'] = get_real_epoch_or_iter(model['config']) + pwc_model_info['Metadata'] = meta_data + + # get dataset name + dataset_name = get_dataset_name(model['config']) + + # get results + results = [] + # if there are more metrics, add here. 
+ if 'bbox_mAP' in model['results']: + metric = round(model['results']['bbox_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Object Detection', + Dataset=dataset_name, + Metrics={'box AP': metric})) + if 'segm_mAP' in model['results']: + metric = round(model['results']['segm_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Instance Segmentation', + Dataset=dataset_name, + Metrics={'mask AP': metric})) + if 'PQ' in model['results']: + metric = round(model['results']['PQ'], 1) + results.append( + OrderedDict( + Task='Panoptic Segmentation', + Dataset=dataset_name, + Metrics={'PQ': metric})) + pwc_model_info['Results'] = results + + link_string = 'https://download.openmmlab.com/mmyolo/v0/' + link_string += '{}/{}'.format(model['config'].rstrip('.py'), + osp.split(model['model_path'])[-1]) + pwc_model_info['Weights'] = link_string + if cfg_folder_name in pwc_files: + pwc_files[cfg_folder_name].append(pwc_model_info) + else: + pwc_files[cfg_folder_name] = [pwc_model_info] + return pwc_files + + +def parse_args(): + parser = argparse.ArgumentParser(description='Gather benchmarked models') + parser.add_argument( + 'root', + type=str, + help='root path of benchmarked models to be gathered') + parser.add_argument( + 'out', type=str, help='output path of gathered models to be stored') + parser.add_argument( + '--best', + action='store_true', + help='whether to gather the best model.') + + args = parser.parse_args() + return args + + +# TODO: Refine +def main(): + args = parse_args() + models_root = args.root + models_out = args.out + mkdir_or_exist(models_out) + + # find all models in the root directory to be gathered + raw_configs = list(scandir('./configs', '.py', recursive=True)) + + # filter configs that is not trained in the experiments dir + used_configs = [] + for raw_config in raw_configs: + if osp.exists(osp.join(models_root, raw_config)): + used_configs.append(raw_config) + print(f'Find {len(used_configs)} models to be gathered') + + # find 
final_ckpt and log file for trained each config + # and parse the best performance + model_infos = [] + for used_config in used_configs: + exp_dir = osp.join(models_root, used_config) + by_epoch = is_by_epoch(used_config) + # check whether the exps is finished + if args.best is True: + final_model, final_epoch_or_iter = get_best_epoch_or_iter(exp_dir) + else: + final_epoch_or_iter = get_final_epoch_or_iter(used_config) + final_model = '{}_{}.pth'.format('epoch' if by_epoch else 'iter', + final_epoch_or_iter) + + model_path = osp.join(exp_dir, final_model) + # skip if the model is still training + if not osp.exists(model_path): + continue + + # get the latest logs + latest_exp_name = find_last_dir(exp_dir) + latest_exp_json = osp.join(exp_dir, latest_exp_name, 'vis_data', + latest_exp_name + '.json') + + model_performance = get_final_results( + latest_exp_json, final_epoch_or_iter, by_epoch=by_epoch) + + if model_performance is None: + continue + + model_info = dict( + config=used_config, + results=model_performance, + final_model=final_model, + latest_exp_json=latest_exp_json, + latest_exp_name=latest_exp_name) + model_info['epochs' if by_epoch else 'iterations'] = \ + final_epoch_or_iter + model_infos.append(model_info) + + # publish model for each checkpoint + publish_model_infos = [] + for model in model_infos: + model_publish_dir = osp.join(models_out, model['config'].rstrip('.py')) + mkdir_or_exist(model_publish_dir) + + model_name = osp.split(model['config'])[-1].split('.')[0] + + model_name += '_' + model['latest_exp_name'] + publish_model_path = osp.join(model_publish_dir, model_name) + trained_model_path = osp.join(models_root, model['config'], + model['final_model']) + + # convert model + final_model_path = process_checkpoint(trained_model_path, + publish_model_path) + + # copy log + shutil.copy(model['latest_exp_json'], + osp.join(model_publish_dir, f'{model_name}.log.json')) + + # copy config to guarantee reproducibility + config_path = model['config'] 
+ config_path = osp.join( + 'configs', + config_path) if 'configs' not in config_path else config_path + target_config_path = osp.split(config_path)[-1] + shutil.copy(config_path, osp.join(model_publish_dir, + target_config_path)) + + model['model_path'] = final_model_path + publish_model_infos.append(model) + + models = dict(models=publish_model_infos) + print(f'Totally gathered {len(publish_model_infos)} models') + dump(models, osp.join(models_out, 'model_info.json')) + + pwc_files = convert_model_info_to_pwc(publish_model_infos) + for name in pwc_files: + with open(osp.join(models_out, name + '_metafile.yml'), 'w') as f: + ordered_yaml_dump(pwc_files[name], f, encoding='utf-8') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/.dev_scripts/print_registers.py b/models/YOLO-World/third_party/mmyolo/.dev_scripts/print_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..52646da205969db62d3d59dc2736be00954510e2 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.dev_scripts/print_registers.py @@ -0,0 +1,448 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import importlib +import os +import os.path as osp +import pkgutil +import sys +import tempfile +from multiprocessing import Pool +from pathlib import Path + +import numpy as np +import pandas as pd + +# host_addr = 'https://gitee.com/open-mmlab' +host_addr = 'https://github.com/open-mmlab' +tools_list = ['tools', '.dev_scripts'] +proxy_names = { + 'mmdet': 'mmdetection', + 'mmseg': 'mmsegmentation', + 'mmcls': 'mmclassification' +} +merge_module_keys = {'mmcv': ['mmengine']} +# exclude_prefix = {'mmcv': ['{_k}') + table_data.append((registry_name, registry_strings)) + + # sort the data list + table_data = sorted(table_data, key=lambda x: len(x[1])) + # split multi parts + table_data_multi_parts = [] + for (registry_name, registry_strings) in table_data: + multi_parts = False + if len(registry_strings) > max_size_per_cell: + multi_parts = True + for cell_idx, registry_cell in enumerate( + divide_list_into_groups(registry_strings, max_size_per_cell)): + registry_str = ''.join(registry_cell.tolist()) + registry_str = f'
    {registry_str}
' + table_data_multi_parts.append([ + registry_name if not multi_parts else + f'{registry_name} (part {cell_idx + 1})', registry_str + ]) + + for table_data in divide_list_into_groups(table_data_multi_parts, + max_col_per_row): + table_data = list(zip(*table_data.tolist())) + html += dataframe_to_html( + pd.DataFrame([table_data[1]], columns=table_data[0])) + if html: + html = f'
{title}
\n{html}' + html = f'
{html}
\n' + return html + + +def tools_to_html(tools_dict, repo_name=''): + + def _recurse(_dict, _connector, _result): + assert isinstance(_dict, dict), \ + f'unknown recurse type: {_dict} ({type(_dict)})' + for _k, _v in _dict.items(): + if _v is None: + if _connector not in _result: + _result[_connector] = [] + _result[_connector].append(_k) + else: + _recurse(_v, osp.join(_connector, _k), _result) + + table_data = {} + title = f'{capitalize(repo_name)} Tools' + _recurse(tools_dict, '', table_data) + return registries_to_html(table_data, title) + + +def dataframe_to_html(dataframe): + styler = dataframe.style + styler = styler.hide(axis='index') + styler = styler.format(na_rep='-') + styler = styler.set_properties(**{ + 'text-align': 'left', + 'align': 'center', + 'vertical-align': 'top' + }) + styler = styler.set_table_styles([{ + 'selector': + 'thead th', + 'props': + 'align:center;text-align:center;vertical-align:bottom' + }]) + html = styler.to_html() + html = f'
\n{html}
' + return html + + +def generate_markdown_by_repository(repo_name, + module_name, + branch, + pulldir, + throw_error=False): + # add the pull dir to the system path so that it can be found + if pulldir not in sys.path: + sys.path.insert(0, pulldir) + module_list, error_dict = load_modules_from_dir( + module_name, pulldir, throw_error=throw_error) + registries_tree = get_registries_from_modules(module_list) + if error_dict: + error_dict_name = 'error_modules' + assert (error_dict_name not in registries_tree), \ + f'duplicate module name was found: {error_dict_name}' + registries_tree.update({error_dict_name: error_dict}) + # get the tools files + for tools_name in tools_list: + assert (tools_name not in registries_tree), \ + f'duplicate tools name was found: {tools_name}' + tools_tree = osp.join(pulldir, tools_name) + tools_tree = get_scripts_from_dir(tools_tree) + registries_tree.update({tools_name: tools_tree}) + # print_tree(registries_tree) + # get registries markdown string + module_registries = registries_tree.get(module_name, {}) + for merge_key in merge_module_keys.get(module_name, []): + merge_dict = registries_tree.get(merge_key, {}) + merge_registries(module_registries, merge_dict) + for exclude_key in exclude_prefix.get(module_name, []): + exclude_registries(module_registries, exclude_key) + markdown_str = registries_to_html( + module_registries, title=f'{capitalize(repo_name)} Module Components') + # get tools markdown string + tools_registries = {} + for tools_name in tools_list: + tools_registries.update( + {tools_name: registries_tree.get(tools_name, {})}) + markdown_str += tools_to_html(tools_registries, repo_name=repo_name) + version_str = get_version_from_module_name(module_name, branch) + title_str = f'\n\n## {capitalize(repo_name)}{version_str}\n' + # remove the pull dir from system path + if pulldir in sys.path: + sys.path.remove(pulldir) + return f'{title_str}{markdown_str}' + + +def parse_args(): + parser = argparse.ArgumentParser( + 
description='print registries in openmmlab repositories') + parser.add_argument( + '-r', + '--repositories', + nargs='+', + default=['mmdet', 'mmcls', 'mmseg', 'mmengine', 'mmcv'], + type=str, + help='git repositories name in OpenMMLab') + parser.add_argument( + '-b', + '--branches', + nargs='+', + default=['3.x', '1.x', '1.x', 'main', '2.x'], + type=str, + help='the branch names of git repositories, the length of branches ' + 'must be same as the length of repositories') + parser.add_argument( + '-o', '--out', type=str, default='.', help='output path of the file') + parser.add_argument( + '--throw-error', + action='store_true', + default=False, + help='whether to throw error when trying to import modules') + args = parser.parse_args() + return args + + +# TODO: Refine +def main(): + args = parse_args() + repositories = args.repositories + branches = args.branches + assert isinstance(repositories, list), \ + 'Type of repositories must be list' + if branches is None: + branches = [None] * len(repositories) + assert isinstance(branches, list) and \ + len(branches) == len(repositories), \ + 'The length of branches must be same as ' \ + 'that of repositories' + assert isinstance(args.out, str), \ + 'The type of output path must be string' + # save path of file + mkdir_or_exist(args.out) + save_path = osp.join(args.out, 'registries_info.md') + with tempfile.TemporaryDirectory() as tmpdir: + # multi process init + pool = Pool(processes=len(repositories)) + multi_proc_input_list = [] + multi_proc_output_list = [] + # get the git repositories + for branch, repository in zip(branches, repositories): + repo_name, module_name = parse_repo_name(repository) + pulldir = osp.join(tmpdir, f'tmp_{repo_name}') + git_pull_branch( + repo_name=repo_name, branch_name=branch, pulldir=pulldir) + multi_proc_input_list.append( + (repo_name, module_name, branch, pulldir, args.throw_error)) + print('starting the multi process to get the registries') + for multi_proc_input in 
multi_proc_input_list: + multi_proc_output_list.append( + pool.apply_async(generate_markdown_by_repository, + multi_proc_input)) + pool.close() + pool.join() + with open(save_path, 'w', encoding='utf-8') as fw: + fw.write(f'{markdown_title}\n') + for multi_proc_output in multi_proc_output_list: + markdown_str = multi_proc_output.get() + fw.write(f'{markdown_str}\n') + print(f'saved registries to the path: {save_path}') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/.github/CODE_OF_CONDUCT.md b/models/YOLO-World/third_party/mmyolo/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..92afad1c5ab5d5781115dee45c131d3751d3cd31 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual attention or + advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or electronic + address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at chenkaidev@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + +[homepage]: https://www.contributor-covenant.org diff --git a/models/YOLO-World/third_party/mmyolo/.github/CONTRIBUTING.md b/models/YOLO-World/third_party/mmyolo/.github/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..4ac764f10587497cb6da5ba453c08056d5bc9df7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/CONTRIBUTING.md @@ -0,0 +1 @@ +We appreciate all contributions to improve MMYOLO. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. 
diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..0cec5853ebbde572c2c6322f9d7123cac5a97df7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,67 @@ +name: "🐞 Bug report" +description: "Create a report to help us reproduce and fix the bug" + + +body: + - type: markdown + attributes: + value: | + Thank you for reporting this issue to help us improve! + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmyolo/pulls)! + If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose). + If you need our help, please fill in as much of the following form as you're able. + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [the existing and past issues](https://github.com/open-mmlab/mmyolo/issues) but cannot get the expected help. + required: true + - label: I have read the [FAQ documentation](https://mmyolo.readthedocs.io/en/latest/faq.html) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmyolo). + required: true + + - type: textarea + attributes: + label: 🐞 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + Preferably a simple and minimal code snippet that we can reproduce the error by running the code. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ```shell + The command or script you run. 
+ ``` + + ``` + The error message or logs you got, with the full traceback. + ``` + validations: + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmyolo/utils/collect_env.py` to collect necessary environment information and paste it here. + You may add addition that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. Did you make any modifications on the code or config? Did you understand what you have modified? + 2. What dataset did you use? + 3. What do you think might be the reason? diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 0000000000000000000000000000000000000000..8b24846777e89685bcb99c5d79663839536b6607 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,32 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +labels: [feature request] + +body: + - type: markdown + attributes: + value: | + Thank you for suggesting an idea to make MMYOLO better. + We strongly appreciate you creating a PR to implete this feature [here](https://github.com/open-mmlab/mmyolo/pulls)! + + If you need our help, please fill in as much of the following form as you're able. + + - type: textarea + attributes: + label: What is the problem this feature will solve? + placeholder: | + E.g., It is inconvenient when \[....\]. 
+ validations: + required: true + + - type: textarea + attributes: + label: What is the feature you are proposing to solve the problem? + validations: + required: true + + - type: textarea + attributes: + label: What alternatives have you considered? + description: | + Add any other context or screenshots about the feature request here. diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml new file mode 100644 index 0000000000000000000000000000000000000000..2aacff4abc353c1e999c8e5952c86ffcac38b063 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml @@ -0,0 +1,30 @@ +name: "\U0001F31F New model/dataset addition" +description: Submit a proposal/request to implement a new model / dataset +labels: [ "New model/dataset" ] + +body: + - type: textarea + id: description-request + validations: + required: true + attributes: + label: Model/Dataset description + description: | + Put any and all important information relative to the model/dataset + + - type: checkboxes + attributes: + label: Open source status + description: | + Please provide the open-source status, which would be very helpful + options: + - label: "The model implementation is available" + - label: "The model weights are available." + + - type: textarea + id: additional-info + attributes: + label: Provide useful links for the implementation + description: | + Please provide information regarding the implementation, the weights, and the authors. + Please mention the authors by @gh-username if you're aware of their usernames. 
diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..dbf1ef8107a33c41067743097ba78e047be43cdb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml @@ -0,0 +1,22 @@ +name: 📚 Documentation +description: Report an issue related to https://mmyolo.readthedocs.io/en/latest/. + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://mmyolo.readthedocs.io/en/latest/ is an issue. + validations: + required: true + +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. + +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml new file mode 100644 index 0000000000000000000000000000000000000000..1240aa896a50151ad47cc1bf0813d0b40d7e7169 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml @@ -0,0 +1,87 @@ +name: "💥 Reimplementation Questions" +description: "Ask about questions during model reimplementation" + + +body: + - type: markdown + attributes: + value: | + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmyolo/pulls)! + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [the existing and past issues](https://github.com/open-mmlab/mmyolo/issues) but cannot get the expected help. 
+ required: true + - label: I have read the [FAQ documentation](https://mmyolo.readthedocs.io/en/latest/faq.html) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmyolo). + required: true + validations: + required: true + + - type: textarea + attributes: + label: 💬 Describe the reimplementation questions + description: | + A clear and concise description of what the problem you meet and what have you done. + There are several common situations in the reimplementation issues as below + + 1. Reimplement a model in the model zoo using the provided configs + 2. Reimplement a model in the model zoo on other dataset (e.g., custom datasets) + 3. Reimplement a custom model but all the components are implemented in MMDetection + 4. Reimplement a custom model with new modules implemented by yourself + + There are several things to do for different cases as below. + + - For case 1 & 3, please follow the steps in the following sections thus we could help to quick identify the issue. + - For case 2 & 4, please understand that we are not able to do much help here because we usually do not know the full code and the users should be responsible to the code they write. + - One suggestion for case 2 & 4 is that the users should first check whether the bug lies in the self-implemented code or the original code. For example, users can first make sure that the same model runs well on supported datasets. If you still need help, please describe what you have done and what you obtain in the issue, and follow the steps in the following sections and try as clear as possible so that we can better help you. + placeholder: | + A clear and concise description of what the bug is. + What config dir you run? + + ```none + A placeholder for the config. + ``` + + ```shell + The command or script you run. + ``` + + ``` + The error message or logs you got, with the full traceback. 
+ ``` + validations: + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmyolo/utils/collect_env.py` to collect necessary environment information and paste it here. + You may add addition that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + attributes: + label: Expected results + description: If applicable, paste the related results here, e.g., what you expect and what you get. + placeholder: | + ```none + A placeholder for results comparison + ``` + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. Did you make any modifications on the code or config? Did you understand what you have modified? + 2. What dataset did you use? + 3. What do you think might be the reason? 
diff --git a/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..585c786b50b3692e996a1d150470852e876a24dc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ +blank_issues_enabled: true + +contact_links: + - name: 💬 Forum + url: https://github.com/open-mmlab/mmyolo/discussions + about: Ask general usage questions and discuss with other MMYOLO community members + - name: 🌐 Explore OpenMMLab + url: https://openmmlab.com/ + about: Get know more about OpenMMLab diff --git a/models/YOLO-World/third_party/mmyolo/.github/pull_request_template.md b/models/YOLO-World/third_party/mmyolo/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..2997d883eec5e36302b7a4505f2d218f5cdf7c91 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/pull_request_template.md @@ -0,0 +1,25 @@ +Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more healthy and more easily get feedback. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers. + +## Motivation + +Please describe the motivation for this PR and the goal you want to achieve through this PR. + +## Modification + +Please briefly describe what modification is made in this PR. + +## BC-breaking (Optional) + +Does the modification introduce changes that break the backward compatibility of the downstream repos? +If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR. + +## Use cases (Optional) + +If this PR introduces a new feature, it is better to list some use cases here and update the documentation. + +## Checklist + +1. 
Pre-commit or other linting tools are used to fix potential lint issues. +2. The modification is covered by complete unit tests. If not, please add more unit tests to ensure the correctness. +3. If the modification has a potential influence on downstream projects, this PR should be tested with downstream projects, like MMDetection or MMClassification. +4. The documentation has been modified accordingly, like docstring or example tutorials. diff --git a/models/YOLO-World/third_party/mmyolo/.github/workflows/deploy.yml b/models/YOLO-World/third_party/mmyolo/.github/workflows/deploy.yml new file mode 100644 index 0000000000000000000000000000000000000000..08f542bbaaae1a1f0f33712544e1ff08c7aa2e85 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.github/workflows/deploy.yml @@ -0,0 +1,28 @@ +name: deploy + +on: push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-n-publish: + runs-on: ubuntu-latest + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install torch + run: pip install torch + - name: Install wheel + run: pip install wheel + - name: Build MMYOLO + run: python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/models/YOLO-World/third_party/mmyolo/.pre-commit-config-zh-cn.yaml b/models/YOLO-World/third_party/mmyolo/.pre-commit-config-zh-cn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52bb607e86cedc4f0ac9d188bb7ec717d88b35fb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.pre-commit-config-zh-cn.yaml @@ -0,0 +1,60 @@ +exclude: ^tests/data/ +repos: + - repo: https://gitee.com/openmmlab/mirrors-flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://gitee.com/openmmlab/mirrors-isort + rev: 5.11.5 
+ hooks: + - id: isort + - repo: https://gitee.com/openmmlab/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://gitee.com/openmmlab/mirrors-mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://gitee.com/openmmlab/mirrors-codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://gitee.com/openmmlab/mirrors-docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://gitee.com/openmmlab/mirrors-pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["mmyolo", "tests"] +# - repo: https://gitee.com/openmmlab/mirrors-mypy +# rev: v0.812 +# hooks: +# - id: mypy +# exclude: "docs" diff --git a/models/YOLO-World/third_party/mmyolo/.pre-commit-config.yaml b/models/YOLO-World/third_party/mmyolo/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffae20d2d3941607fd541e03e22c0e351f296d88 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.pre-commit-config.yaml @@ -0,0 +1,60 @@ +exclude: ^tests/data/ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: 
trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/codespell-project/codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/asottile/pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["mmyolo", "tests"] +# - repo: https://github.com/pre-commit/mirrors-mypy +# rev: v0.812 +# hooks: +# - id: mypy +# exclude: "docs" diff --git a/models/YOLO-World/third_party/mmyolo/.readthedocs.yml b/models/YOLO-World/third_party/mmyolo/.readthedocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..c9ab01ce18caeebce129472bd63b0465405d6a50 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/.readthedocs.yml @@ -0,0 +1,8 @@ +version: 2 + +formats: all + +python: + version: 3.7 + install: + - requirements: requirements/docs.txt diff --git a/models/YOLO-World/third_party/mmyolo/LICENSE b/models/YOLO-World/third_party/mmyolo/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/models/YOLO-World/third_party/mmyolo/MANIFEST.in b/models/YOLO-World/third_party/mmyolo/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..5bf1d9ebabcc5ca1f28207b62eab10141474db51 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements/*.txt +include mmyolo/VERSION +include mmyolo/.mim/model-index.yml +include mmyolo/.mim/demo/*/* +recursive-include mmyolo/.mim/configs *.py *.yml +recursive-include mmyolo/.mim/tools *.sh *.py diff --git a/models/YOLO-World/third_party/mmyolo/README.md b/models/YOLO-World/third_party/mmyolo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b799a759c367938cbeea728b0763a36cda5b2544 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/README.md @@ -0,0 +1,428 @@ +
+ +
 
+
+ OpenMMLab website + + + HOT + + +      + OpenMMLab platform + + + TRY IT OUT + + +
+
 
+ +[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/) +[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo) +[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) + +[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) | +[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) | +[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) | +[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose) + +
+ +
+ +English | [简体中文](README_zh-CN.md) + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +## 📄 Table of Contents + +- [🥳 🚀 What's New](#--whats-new-) + - [✨ Highlight](#-highlight-) +- [📖 Introduction](#-introduction-) +- [🛠️ Installation](#%EF%B8%8F-installation-) +- [👨‍🏫 Tutorial](#-tutorial-) +- [📊 Overview of Benchmark and Model Zoo](#-overview-of-benchmark-and-model-zoo-) +- [❓ FAQ](#-faq-) +- [🙌 Contributing](#-contributing-) +- [🤝 Acknowledgement](#-acknowledgement-) +- [🖊️ Citation](#️-citation-) +- [🎫 License](#-license-) +- [🏗️ Projects in OpenMMLab](#%EF%B8%8F-projects-in-openmmlab-) + +## 🥳 🚀 What's New [🔝](#-table-of-contents) + +💎 **v0.6.0** was released on 15/8/2023: + +- Support YOLOv5 instance segmentation +- Support YOLOX-Pose based on MMPose +- Add 15 minutes instance segmentation tutorial. +- YOLOv5 supports using mask annotation to optimize bbox +- Add Multi-scale training and testing docs + +For release history and update details, please refer to [changelog](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html). + +### ✨ Highlight [🔝](#-table-of-contents) + +We are excited to announce our latest work on real-time object recognition tasks, **RTMDet**, a family of fully convolutional single-stage detectors. RTMDet not only achieves the best parameter-accuracy trade-off on object detection from tiny to extra-large model sizes but also obtains new state-of-the-art performance on instance segmentation and rotated object detection tasks. Details can be found in the [technical report](https://arxiv.org/abs/2212.07784). Pre-trained models are [here](configs/rtmdet). 
+ +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+ +
+ +MMYOLO currently implements object detection and rotated object detection algorithms, and it offers significant training acceleration compared to the MMDetection version: the training speed is 2.6 times faster than the previous version. + +## 📖 Introduction [🔝](#-table-of-contents) + +MMYOLO is an open source toolbox for YOLO series algorithms based on PyTorch and [MMDetection](https://github.com/open-mmlab/mmdetection). It is a part of the [OpenMMLab](https://openmmlab.com/) project. + +The master branch works with **PyTorch 1.6+**. + + +
+Major features + +- 🕹️ **Unified and convenient benchmark** + + MMYOLO unifies the implementation of modules in various YOLO algorithms and provides a unified benchmark. Users can compare and analyze in a fair and convenient way. + +- 📚 **Rich and detailed documentation** + + MMYOLO provides rich documentation for getting started, model deployment, advanced usages, and algorithm analysis, making it easy for users at different levels to get started and make extensions quickly. + +- 🧩 **Modular Design** + + MMYOLO decomposes the framework into different components where users can easily customize a model by combining different modules with various training and testing strategies. + +BaseModule-P5 + The figure above is contributed by RangeKing@GitHub, thank you very much! + +And the figure of P6 model is in [model_design.md](docs/en/recommended_topics/model_design.md). + +
+ +## 🛠️ Installation [🔝](#-table-of-contents) + +MMYOLO relies on PyTorch, MMCV, MMEngine, and MMDetection. Below are quick steps for installation. Please refer to the [Install Guide](docs/en/get_started/installation.md) for more detailed instructions. + +```shell +conda create -n mmyolo python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y +conda activate mmyolo +pip install openmim +mim install "mmengine>=0.6.0" +mim install "mmcv>=2.0.0rc4,<2.1.0" +mim install "mmdet>=3.0.0,<4.0.0" +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +# Install albumentations +pip install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +``` + +## 👨‍🏫 Tutorial [🔝](#-table-of-contents) + +MMYOLO is based on MMDetection and adopts the same code structure and design approach. To make the best use of it, please first read the [MMDetection Overview](https://mmdetection.readthedocs.io/en/latest/get_started.html) to get a basic understanding of MMDetection. + +The usage of MMYOLO is almost identical to that of MMDetection, and the MMDetection tutorials apply to MMYOLO as well; you can also consult the [MMDetection User Guide and Advanced Guide](https://mmdetection.readthedocs.io/en/3.x/). + +For the parts that differ from MMDetection, we have also prepared user guides and advanced guides; please read our [documentation](https://mmyolo.readthedocs.io/en/latest/). + +
+Get Started + +- [Overview](docs/en/get_started/overview.md) +- [Dependencies](docs/en/get_started/dependencies.md) +- [Installation](docs/en/get_started/installation.md) +- [15 minutes object detection](docs/en/get_started/15_minutes_object_detection.md) +- [15 minutes rotated object detection](docs/en/get_started/15_minutes_rotated_object_detection.md) +- [15 minutes instance segmentation](docs/en/get_started/15_minutes_instance_segmentation.md) +- [Resources summary](docs/en/get_started/article.md) + +
+ +
+Recommended Topics + +- [How to contribute code to MMYOLO](docs/en/recommended_topics/contributing.md) +- [Training testing tricks](docs/en/recommended_topics/training_testing_tricks.md) +- [MMYOLO model design](docs/en/recommended_topics/model_design.md) +- [Algorithm principles and implementation](docs/en/recommended_topics/algorithm_descriptions/) +- [Replace the backbone network](docs/en/recommended_topics/replace_backbone.md) +- [MMYOLO model complexity analysis](docs/en/recommended_topics/complexity_analysis.md) +- [Annotation-to-deployment workflow for custom dataset](docs/en/recommended_topics/labeling_to_deployment_tutorials.md) +- [Visualization](docs/en/recommended_topics/visualization.md) +- [Model deployment](docs/en/recommended_topics/deploy/) +- [Troubleshooting steps](docs/en/recommended_topics/troubleshooting_steps.md) +- [MMYOLO application examples](docs/en/recommended_topics/application_examples/) +- [MM series repo essential basics](docs/en/recommended_topics/mm_basics.md) +- [Dataset preparation and description](docs/en/recommended_topics/dataset_preparation.md) + +
+ +
+Common Usage + +- [Resume training](docs/en/common_usage/resume_training.md) +- [Enabling and disabling SyncBatchNorm](docs/en/common_usage/syncbn.md) +- [Enabling AMP](docs/en/common_usage/amp_training.md) +- [Multi-scale training and testing](docs/en/common_usage/ms_training_testing.md) +- [TTA Related Notes](docs/en/common_usage/tta.md) +- [Add plugins to the backbone network](docs/en/common_usage/plugins.md) +- [Freeze layers](docs/en/common_usage/freeze_layers.md) +- [Output model predictions](docs/en/common_usage/output_predictions.md) +- [Set random seed](docs/en/common_usage/set_random_seed.md) +- [Module combination](docs/en/common_usage/module_combination.md) +- [Cross-library calls using mim](docs/en/common_usage/mim_usage.md) +- [Apply multiple Necks](docs/en/common_usage/multi_necks.md) +- [Specify specific device training or inference](docs/en/common_usage/specify_device.md) +- [Single and multi-channel application examples](docs/en/common_usage/single_multi_channel_applications.md) + +
+ +
+Useful Tools + +- [Browse coco json](docs/en/useful_tools/browse_coco_json.md) +- [Browse dataset](docs/en/useful_tools/browse_dataset.md) +- [Print config](docs/en/useful_tools/print_config.md) +- [Dataset analysis](docs/en/useful_tools/dataset_analysis.md) +- [Optimize anchors](docs/en/useful_tools/optimize_anchors.md) +- [Extract subcoco](docs/en/useful_tools/extract_subcoco.md) +- [Visualization scheduler](docs/en/useful_tools/vis_scheduler.md) +- [Dataset converters](docs/en/useful_tools/dataset_converters.md) +- [Download dataset](docs/en/useful_tools/download_dataset.md) +- [Log analysis](docs/en/useful_tools/log_analysis.md) +- [Model converters](docs/en/useful_tools/model_converters.md) + +
+ +
+Basic Tutorials + +- [Learn about configs with YOLOv5](docs/en/tutorials/config.md) +- [Data flow](docs/en/tutorials/data_flow.md) +- [Rotated detection](docs/en/tutorials/rotated_detection.md) +- [Custom Installation](docs/en/tutorials/custom_installation.md) +- [Common Warning Notes](docs/zh_cn/tutorials/warning_notes.md) +- [FAQ](docs/en/tutorials/faq.md) + +
+ +
+Advanced Tutorials + +- [MMYOLO cross-library application](docs/en/advanced_guides/cross-library_application.md) + +
+ +
+Descriptions + +- [Changelog](docs/en/notes/changelog.md) +- [Compatibility](docs/en/notes/compatibility.md) +- [Conventions](docs/en/notes/conventions.md) +- [Code Style](docs/en/notes/code_style.md) + +
+ +## 📊 Overview of Benchmark and Model Zoo [🔝](#-table-of-contents) + +
+ +
+ +Results and models are available in the [model zoo](docs/en/model_zoo.md). + +
+Supported Tasks + +- [x] Object detection +- [x] Rotated object detection + +
+ +
+Supported Algorithms + +- [x] [YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (Inference only) +- [x] [YOLOX](configs/yolox) +- [x] [RTMDet](configs/rtmdet) +- [x] [RTMDet-Rotated](configs/rtmdet) +- [x] [YOLOv6](configs/yolov6) +- [x] [YOLOv7](configs/yolov7) +- [x] [PPYOLOE](configs/ppyoloe) +- [x] [YOLOv8](configs/yolov8) + +
+ +
+Supported Datasets + +- [x] COCO Dataset +- [x] VOC Dataset +- [x] CrowdHuman Dataset +- [x] DOTA 1.0 Dataset + +
+ +
+
+ Module Components +
+ + + + + + + + + + + + + + + + + +
+ Backbones + + Necks + + Loss + + Common +
+
    +
  • YOLOv5CSPDarknet
  • +
  • YOLOv8CSPDarknet
  • +
  • YOLOXCSPDarknet
  • +
  • EfficientRep
  • +
  • CSPNeXt
  • +
  • YOLOv7Backbone
  • +
  • PPYOLOECSPResNet
  • +
  • mmdet backbone
  • +
  • mmcls backbone
  • +
  • timm
  • +
+
+
    +
  • YOLOv5PAFPN
  • +
  • YOLOv8PAFPN
  • +
  • YOLOv6RepPAFPN
  • +
  • YOLOXPAFPN
  • +
  • CSPNeXtPAFPN
  • +
  • YOLOv7PAFPN
  • +
  • PPYOLOECSPPAFPN
  • +
+
+
    +
  • IoULoss
  • +
  • mmdet loss
  • +
+
+
    +
+
+ +
+ +## ❓ FAQ [🔝](#-table-of-contents) + +Please refer to the [FAQ](docs/en/tutorials/faq.md) for frequently asked questions. + +## 🙌 Contributing [🔝](#-table-of-contents) + +We appreciate all contributions to improving MMYOLO. Ongoing projects can be found in our [GitHub Projects](https://github.com/open-mmlab/mmyolo/projects). Community users are welcome to participate in these projects. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline. + +## 🤝 Acknowledgement [🔝](#-table-of-contents) + +MMYOLO is an open source project with contributions from researchers and engineers at various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedback. +We hope that the toolbox and benchmark can serve the growing research community by providing a flexible toolkit to re-implement existing methods and develop their own new detectors. + +
+ +
+ +## 🖊️ Citation [🔝](#-table-of-contents) + +If you find this project useful in your research, please consider citing: + +```latex +@misc{mmyolo2022, + title={{MMYOLO: OpenMMLab YOLO} series toolbox and benchmark}, + author={MMYOLO Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmyolo}}, + year={2022} +} +``` + +## 🎫 License [🔝](#-table-of-contents) + +This project is released under the [GPL 3.0 license](LICENSE). + +## 🏗️ Projects in OpenMMLab [🔝](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark. +- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. 
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab machine learning evaluation library. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/models/YOLO-World/third_party/mmyolo/README_zh-CN.md b/models/YOLO-World/third_party/mmyolo/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..6eb4d95fe5c6d013d677482762d722b20ce826f0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/README_zh-CN.md @@ -0,0 +1,468 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+
 
+ +[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/zh_CN/latest/) +[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo) +[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) + +[📘使用文档](https://mmyolo.readthedocs.io/zh_CN/latest/) | +[🛠️安装教程](https://mmyolo.readthedocs.io/zh_CN/latest/get_started/installation.html) | +[👀模型库](https://mmyolo.readthedocs.io/zh_CN/latest/model_zoo.html) | +[🆕更新日志](https://mmyolo.readthedocs.io/zh_CN/latest/notes/changelog.html) | +[🤔报告问题](https://github.com/open-mmlab/mmyolo/issues/new/choose) + +
+ +
+ +[English](README.md) | 简体中文 + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +## 📄 Table of Contents + +- [🥳 🚀 最新进展](#--最新进展-) + - [✨ 亮点](#-亮点-) +- [📖 简介](#-简介-) +- [🛠️ 安装](#️%EF%B8%8F-安装-) +- [👨‍🏫 教程](#-教程-) +- [📊 基准测试和模型库](#-基准测试和模型库-) +- [❓ 常见问题](#-常见问题-) +- [🙌 贡献指南](#-贡献指南-) +- [🤝 致谢](#🤝-致谢-) +- [🖊️ 引用](#️-引用-) +- [🎫 开源许可证](#-开源许可证-) +- [🏗️ OpenMMLab 的其他项目](#%EF%B8%8F-openmmlab-的其他项目-) +- [❤️ 欢迎加入 OpenMMLab 社区](#%EF%B8%8F-欢迎加入-openmmlab-社区-) + +## 🥳 🚀 最新进展 [🔝](#-table-of-contents) + +💎 **v0.6.0** 版本已经在 2023.8.15 发布: + +- 支持 YOLOv5 实例分割 +- 基于 MMPose 支持 YOLOX-Pose +- 添加 15 分钟的实例分割教程 +- YOLOv5 支持使用 mask 标注来优化边界框 +- 添加多尺度训练和测试文档 + +我们提供了实用的**脚本命令速查表** + +
+ +
+ +你可以点击[链接](https://pan.baidu.com/s/1QEaqT7YayUdEvh1an0gjHg?pwd=yolo),下载高清版 PDF 文件。 + +同时我们也推出了解读视频: + +| | 内容 | 视频 | 课程中的代码 | +| :-: | :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 🌟 | 特征图可视化 | [![Link](https://i2.hdslb.com/bfs/archive/480a0eb41fce26e0acb65f82a74501418eee1032.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV188411s7o8) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV188411s7o8)](https://www.bilibili.com/video/BV188411s7o8) | [特征图可视化.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/%5B%E5%B7%A5%E5%85%B7%E7%B1%BB%E7%AC%AC%E4%B8%80%E6%9C%9F%5D%E7%89%B9%E5%BE%81%E5%9B%BE%E5%8F%AF%E8%A7%86%E5%8C%96.ipynb) | +| 🌟 | 源码阅读和调试「必备」技巧 | [![Link](https://i2.hdslb.com/bfs/archive/790d2422c879ff20488910da1c4422b667ea6af7.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1N14y1V7mB) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1N14y1V7mB)](https://www.bilibili.com/video/BV1N14y1V7mB) | [源码阅读和调试「必备」技巧文档](https://zhuanlan.zhihu.com/p/580885852) | +| 🌟 | 10分钟换遍主干网络 | 
[![Link](http://i0.hdslb.com/bfs/archive/c51f1aef7c605856777249a7b4478f44bd69f3bd.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1JG4y1d7GC) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1JG4y1d7GC)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [10分钟换遍主干网络文档](https://zhuanlan.zhihu.com/p/585641598)
[10分钟换遍主干网络.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第二期]10分钟换遍主干网络.ipynb) | +| 🌟 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1RG4y137i5) | [自定义数据集从标注到部署保姆级教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) | +| 🌟 | 顶会第一步 · 模块自定义 | [![Link](http://i2.hdslb.com/bfs/archive/5b23d41ac57466824eaf185ef806ef734414e93b.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1yd4y1j7VD) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1yd4y1j7VD)](https://www.bilibili.com/video/BV1yd4y1j7VD) | [顶会第一步·模块自定义.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第四期]顶会第一步·模块自定义.ipynb) | + +完整视频列表请参考 [中文解读资源汇总 - 视频](https://mmyolo.readthedocs.io/zh_CN/latest/get_started/article.html) + +发布历史和更新细节请参考 [更新日志](https://mmyolo.readthedocs.io/zh_CN/latest/notes/changelog.html) + +### ✨ 亮点 [🔝](#-table-of-contents) + +我们很高兴向大家介绍我们在实时目标识别任务方面的最新成果 RTMDet,包含了一系列的全卷积单阶段检测模型。 RTMDet 不仅在从 tiny 到 extra-large 尺寸的目标检测模型上实现了最佳的参数量和精度的平衡,而且在实时实例分割和旋转目标检测任务上取得了最先进的成果。 更多细节请参阅[技术报告](https://arxiv.org/abs/2212.07784)。 预训练模型可以在[这里](configs/rtmdet)找到。 + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) 
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+ +
+ +MMYOLO 中目前实现了目标检测和旋转框目标检测算法,但是相比 MMDetection 版本有显著训练加速,训练速度相比原先版本提升 2.6 倍。 + +## 📖 简介 [🔝](#-table-of-contents) + +MMYOLO 是一个基于 PyTorch 和 MMDetection 的 YOLO 系列算法开源工具箱。它是 [OpenMMLab](https://openmmlab.com/) 项目的一部分。 + +主分支代码目前支持 PyTorch 1.6 以上的版本。 + + +
+主要特性 + +- 🕹️ **统一便捷的算法评测** + + MMYOLO 统一了各类 YOLO 算法模块的实现, 并提供了统一的评测流程,用户可以公平便捷地进行对比分析。 + +- 📚 **丰富的入门和进阶文档** + + MMYOLO 提供了从入门到部署到进阶和算法解析等一系列文档,方便不同用户快速上手和扩展。 + +- 🧩 **模块化设计** + + MMYOLO 将框架解耦成不同的模块组件,通过组合不同的模块和训练测试策略,用户可以便捷地构建自定义模型。 + +基类-P5 + 图为 RangeKing@GitHub 提供,非常感谢! + +P6 模型图详见 [model_design.md](docs/zh_cn/recommended_topics/model_design.md)。 + +
+ +## 🛠️ 安装 [🔝](#-table-of-contents) + +MMYOLO 依赖 PyTorch, MMCV, MMEngine 和 MMDetection,以下是安装的简要步骤。 更详细的安装指南请参考[安装文档](docs/zh_cn/get_started/installation.md)。 + +```shell +conda create -n mmyolo python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y +conda activate mmyolo +pip install openmim +mim install "mmengine>=0.6.0" +mim install "mmcv>=2.0.0rc4,<2.1.0" +mim install "mmdet>=3.0.0,<4.0.0" +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +# Install albumentations +pip install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +``` + +## 👨‍🏫 教程 [🔝](#-table-of-contents) + +MMYOLO 基于 MMDetection 开源库,并且采用相同的代码组织和设计方式。为了更好的使用本开源库,请先阅读 [MMDetection 概述](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html) 对 MMDetection 进行初步地了解。 + +MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也可以了解 [MMDetection 用户指南和进阶指南](https://mmdetection.readthedocs.io/zh_CN/3.x/) 。 + +针对和 MMDetection 不同的部分,我们也准备了用户指南和进阶指南,请阅读我们的 [文档](https://mmyolo.readthedocs.io/zh_CN/latest/) 。 + +
+开启 MMYOLO 之旅 + +- [概述](docs/zh_cn/get_started/overview.md) +- [依赖](docs/zh_cn/get_started/dependencies.md) +- [安装和验证](docs/zh_cn/get_started/installation.md) +- [15 分钟上手 MMYOLO 目标检测](docs/zh_cn/get_started/15_minutes_object_detection.md) +- [15 分钟上手 MMYOLO 旋转框目标检测](docs/zh_cn/get_started/15_minutes_rotated_object_detection.md) +- [15 分钟上手 MMYOLO 实例分割](docs/zh_cn/get_started/15_minutes_instance_segmentation.md) +- [中文解读资源汇总](docs/zh_cn/get_started/article.md) + +
+ +
+推荐专题 + +- [如何给 MMYOLO 贡献代码](docs/zh_cn/recommended_topics/contributing.md) +- [训练和测试技巧](docs/zh_cn/recommended_topics/training_testing_tricks.md) +- [MMYOLO 模型结构设计](docs/zh_cn/recommended_topics/model_design.md) +- [原理和实现全解析](docs/zh_cn/recommended_topics/algorithm_descriptions/) +- [轻松更换主干网络](docs/zh_cn/recommended_topics/replace_backbone.md) +- [MMYOLO 模型复杂度分析](docs/zh_cn/recommended_topics/complexity_analysis.md) +- [标注+训练+测试+部署全流程](docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md) +- [关于可视化的一切](docs/zh_cn/recommended_topics/visualization.md) +- [模型部署流程](docs/zh_cn/recommended_topics/deploy/) +- [常见错误排查步骤](docs/zh_cn/recommended_topics/troubleshooting_steps.md) +- [MMYOLO 应用范例介绍](docs/zh_cn/recommended_topics/application_examples/) +- [MM 系列 Repo 必备基础](docs/zh_cn/recommended_topics/mm_basics.md) +- [数据集准备和说明](docs/zh_cn/recommended_topics/dataset_preparation.md) + +
+ +
+常用功能 + +- [恢复训练](docs/zh_cn/common_usage/resume_training.md) +- [开启和关闭 SyncBatchNorm](docs/zh_cn/common_usage/syncbn.md) +- [开启混合精度训练](docs/zh_cn/common_usage/amp_training.md) +- [多尺度训练和测试](docs/zh_cn/common_usage/ms_training_testing.md) +- [测试时增强相关说明](docs/zh_cn/common_usage/tta.md) +- [给主干网络增加插件](docs/zh_cn/common_usage/plugins.md) +- [冻结指定网络层权重](docs/zh_cn/common_usage/freeze_layers.md) +- [输出模型预测结果](docs/zh_cn/common_usage/output_predictions.md) +- [设置随机种子](docs/zh_cn/common_usage/set_random_seed.md) +- [算法组合替换教程](docs/zh_cn/common_usage/module_combination.md) +- [使用 mim 跨库调用其他 OpenMMLab 仓库的脚本](docs/zh_cn/common_usage/mim_usage.md) +- [应用多个 Neck](docs/zh_cn/common_usage/multi_necks.md) +- [指定特定设备训练或推理](docs/zh_cn/common_usage/specify_device.md) +- [单通道和多通道应用案例](docs/zh_cn/common_usage/single_multi_channel_applications.md) +- [MM 系列开源库注册表](docs/zh_cn/common_usage/registries_info.md) + +
+ +
+实用工具 + +- [可视化 COCO 标签](docs/zh_cn/useful_tools/browse_coco_json.md) +- [可视化数据集](docs/zh_cn/useful_tools/browse_dataset.md) +- [打印完整配置文件](docs/zh_cn/useful_tools/print_config.md) +- [可视化数据集分析结果](docs/zh_cn/useful_tools/dataset_analysis.md) +- [优化锚框尺寸](docs/zh_cn/useful_tools/optimize_anchors.md) +- [提取 COCO 子集](docs/zh_cn/useful_tools/extract_subcoco.md) +- [可视化优化器参数策略](docs/zh_cn/useful_tools/vis_scheduler.md) +- [数据集转换](docs/zh_cn/useful_tools/dataset_converters.md) +- [数据集下载](docs/zh_cn/useful_tools/download_dataset.md) +- [日志分析](docs/zh_cn/useful_tools/log_analysis.md) +- [模型转换](docs/zh_cn/useful_tools/model_converters.md) + +
+ +
+基础教程 + +- [学习 YOLOv5 配置文件](docs/zh_cn/tutorials/config.md) +- [数据流](docs/zh_cn/tutorials/data_flow.md) +- [旋转目标检测](docs/zh_cn/tutorials/rotated_detection.md) +- [自定义安装](docs/zh_cn/tutorials/custom_installation.md) +- [常见警告说明](docs/zh_cn/tutorials/warning_notes.md) +- [常见问题](docs/zh_cn/tutorials/faq.md) + +
+ +
+进阶教程 + +- [MMYOLO 跨库应用解析](docs/zh_cn/advanced_guides/cross-library_application.md) + +
+ +
+说明 + +- [更新日志](docs/zh_cn/notes/changelog.md) +- [兼容性说明](docs/zh_cn/notes/compatibility.md) +- [默认约定](docs/zh_cn/notes/conventions.md) +- [代码规范](docs/zh_cn/notes/code_style.md) + +
+ +## 📊 基准测试和模型库 [🔝](#-table-of-contents) + +
+ +
+ +测试结果和模型可以在 [模型库](docs/zh_cn/model_zoo.md) 中找到。 + +
+支持的任务 + +- [x] 目标检测 +- [x] 旋转框目标检测 + +
+ +
+支持的算法 + +- [x] [YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (仅推理) +- [x] [YOLOX](configs/yolox) +- [x] [RTMDet](configs/rtmdet) +- [x] [RTMDet-Rotated](configs/rtmdet) +- [x] [YOLOv6](configs/yolov6) +- [x] [YOLOv7](configs/yolov7) +- [x] [PPYOLOE](configs/ppyoloe) +- [x] [YOLOv8](configs/yolov8) + +
+ +
+支持的数据集 + +- [x] COCO Dataset +- [x] VOC Dataset +- [x] CrowdHuman Dataset +- [x] DOTA 1.0 Dataset + +
+ +
+
+ 模块组件 +
+ + + + + + + + + + + + + + + + + +
+ Backbones + + Necks + + Loss + + Common +
+
    +
  • YOLOv5CSPDarknet
  • +
  • YOLOv8CSPDarknet
  • +
  • YOLOXCSPDarknet
  • +
  • EfficientRep
  • +
  • CSPNeXt
  • +
  • YOLOv7Backbone
  • +
  • PPYOLOECSPResNet
  • +
  • mmdet backbone
  • +
  • mmcls backbone
  • +
  • timm
  • +
+
+
    +
  • YOLOv5PAFPN
  • +
  • YOLOv8PAFPN
  • +
  • YOLOv6RepPAFPN
  • +
  • YOLOXPAFPN
  • +
  • CSPNeXtPAFPN
  • +
  • YOLOv7PAFPN
  • +
  • PPYOLOECSPPAFPN
  • +
+
+
    +
  • IoULoss
  • +
  • mmdet loss
  • +
+
+
    +
+
+ +
+ +## ❓ 常见问题 [🔝](#-table-of-contents) + +请参考 [FAQ](docs/zh_cn/tutorials/faq.md) 了解其他用户的常见问题。 + +## 🙌 贡献指南 [🔝](#-table-of-contents) + +我们感谢所有的贡献者为改进和提升 MMYOLO 所作出的努力。我们将正在进行中的项目添加进了[GitHub Projects](https://github.com/open-mmlab/mmyolo/projects)页面,非常欢迎社区用户能参与进这些项目中来。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 + +## 🤝 致谢 [🔝](#-table-of-contents) + +MMYOLO 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现已有算法并开发自己的新模型,从而不断为开源社区提供贡献。 + +
+ +
+ +## 🖊️ 引用 [🔝](#-table-of-contents) + +如果你觉得本项目对你的研究工作有所帮助,请参考如下 bibtex 引用 MMYOLO + +```latex +@misc{mmyolo2022, + title={{MMYOLO: OpenMMLab YOLO} series toolbox and benchmark}, + author={MMYOLO Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmyolo}}, + year={2022} +} +``` + +## 🎫 开源许可证 [🔝](#-table-of-contents) + +该项目采用 [GPL 3.0 开源许可证](LICENSE)。 + +## 🏗️ OpenMMLab 的其他项目 [🔝](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱 +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- 
[MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab 机器学习算法评测库 +- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目 + +## ❤️ 欢迎加入 OpenMMLab 社区 [🔝](#-table-of-contents) + +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=aCvMxdr3) + +
+ +
+ +我们会在 OpenMMLab 社区为大家 + +- 📢 分享 AI 框架的前沿核心技术 +- 💻 解读 PyTorch 常用模块源码 +- 📰 发布 OpenMMLab 的相关新闻 +- 🚀 介绍 OpenMMLab 开发的前沿算法 +- 🏃 获取更高效的问题答疑和意见反馈 +- 🔥 提供与各行各业开发者充分交流的平台 + +干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/models/YOLO-World/third_party/mmyolo/configs/_base_/default_runtime.py b/models/YOLO-World/third_party/mmyolo/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..098f220573cf481056f2f55f0621198270d51c49 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/_base_/default_runtime.py @@ -0,0 +1,43 @@ +default_scope = 'mmyolo' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmdet.DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions +# before MMDet 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +backend_args = None diff --git a/models/YOLO-World/third_party/mmyolo/configs/_base_/det_p5_tta.py 
b/models/YOLO-World/third_party/mmyolo/configs/_base_/det_p5_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..8df0d5ea8db46fe748cc8fe1074aa928c64b4309 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/_base_/det_p5_tta.py @@ -0,0 +1,58 @@ +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +_backend_args = None + +tta_model = dict( + type='mmdet.DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +# LoadImageFromFile +# / | \ +# (RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) # noqa +# / \ / \ / \ +# RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa +# | | | | | | +# LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn +# | | | | | | +# PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa + +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/_base_/pose/coco.py b/models/YOLO-World/third_party/mmyolo/configs/_base_/pose/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..865a95bc02fedd318f32d2e7aa8397147d78fdb5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/_base_/pose/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + 
type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 
16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/base_dynamic.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/base_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..747c21fd2bf0523c7d1e2ace67cff3f3d6612c2a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/base_dynamic.py @@ -0,0 +1,17 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + dynamic_axes={ + 'input': { + 0: 'batch', + 2: 'height', + 3: 'width' + }, + 'dets': { + 0: 'batch', + 1: 'num_dets' + }, + 'labels': { + 0: 'batch', + 1: 'num_dets' + } + }) diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/base_static.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/base_static.py new file mode 100644 index 0000000000000000000000000000000000000000..dee01dd5dde1185b5e156b036f72fb3ccb0bf5bc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/base_static.py @@ -0,0 +1,23 @@ +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['dets', 'labels'], + input_shape=None, + optimize=True) +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) diff 
--git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..14f4a12115f403fb4d091db9c07f925ba2ad83ec --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py @@ -0,0 +1,15 @@ +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py new file mode 100644 index 0000000000000000000000000000000000000000..3eac8ca75715b711bdf03784dbb977a81bf444d3 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py @@ -0,0 +1,15 @@ +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bd31331ebae8374dc06f9ed4e0e82a3204e36f --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py @@ -0,0 +1,9 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) +codebase_config = dict(model_type='rknn') +backend_config = dict( + type='rknn', + common_config=dict(target_platform='rv1126', optimization_level=1), + quantization_config=dict(do_quantization=False, dataset=None), + input_size_list=[[3, 320, 320]]) diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py new file mode 100644 index 0000000000000000000000000000000000000000..10c96b2f26d27be28b384612d9ae8ee2cae84983 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py @@ -0,0 +1,9 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) +codebase_config = dict(model_type='rknn') +backend_config = dict( + type='rknn', + common_config=dict(target_platform='rv1126', optimization_level=1), + quantization_config=dict(do_quantization=True, dataset=None), + input_size_list=[[3, 320, 320]]) diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..da565b6c341add02a74579a734eb4cb123847e6d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 
3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py new file mode 100644 index 0000000000000000000000000000000000000000..bad8521afa6ebd4f9bb24a137b66fd1c66668361 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 32), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 64, 64], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 1344, 1344]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..24d2a00d9340b2e3cd3392ab2881b68cccd75e8a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py @@ -0,0 +1,14 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git 
a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..21591c4d4e72a867392adf9c49cd60c6bb994e35 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py @@ -0,0 +1,15 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +calib_config = dict(create_calib=True, calib_file='calib_data.h5') +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..ac394a6b3f854a0d23a1d37ff07d87c523c9784a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py @@ -0,0 +1,16 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +calib_config = dict(create_calib=True, calib_file='calib_data.h5') +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git 
a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..17047d7380043da537f2f6029bb4373986062c04 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..9ec49cc114cc0025310766be17bb5c45af56c516 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py @@ -0,0 +1,14 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py new file mode 100644 index 
0000000000000000000000000000000000000000..11b7f6a040271f4c82fce8e8240b23ad54fd18c7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py @@ -0,0 +1,19 @@ +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py b/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py new file mode 100644 index 0000000000000000000000000000000000000000..4f64438ca3d3ba1699e514bc2c8ee900d5095d4d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py @@ -0,0 +1,19 @@ +_base_ = '../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/README.md b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..70a5b2055bbbc79cc6e4817cc3d936780b09f73e --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/README.md @@ -0,0 +1,43 @@ +# PPYOLOE + + + +## Abstract + +PP-YOLOE is an excellent single-stage anchor-free model based on PP-YOLOv2, surpassing a variety of popular YOLO models. PP-YOLOE has a series of models, named s/m/l/x, which are configured through width multiplier and depth multiplier. PP-YOLOE avoids using special operators, such as Deformable Convolution or Matrix NMS, to be deployed friendly on various hardware. + +
+ +
+ +
+ +PPYOLOE-PLUS-l model structure +
+ +## Results and models + +### PPYOLOE+ COCO + +| Backbone | Arch | Size | Epoch | SyncBN | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :---: | :----: | :------: | :----: | :----------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| PPYOLOE+ -s | P5 | 640 | 80 | Yes | 4.7 | 43.5 | [config](./ppyoloe_plus_s_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052.log.json) | +| PPYOLOE+ -m | P5 | 640 | 80 | Yes | 8.4 | 49.5 | [config](./ppyoloe_plus_m_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132.log.json) | +| PPYOLOE+ -l | P5 | 640 | 80 | Yes | 13.2 | 52.6 | [config](./ppyoloe_plus_l_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825.log.json) | +| PPYOLOE+ -x | P5 | 640 | 80 | Yes | 19.1 | 54.2 | [config](./ppyoloe_plus_x_fast_8xb8-80e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921.log.json) | + +**Note**: + +1. The Box APs above are those of the best-performing models on COCO. +2. The gap between the above performance and the official release is about 0.3. To speed up training in mmyolo, we use PyTorch to implement the image resizing in `PPYOLOEBatchRandomResize` for multi-scale training, while the official PPYOLOE uses OpenCV. In addition, `lanczos4` is not yet supported in `PPYOLOEBatchRandomResize`. These two differences lead to the gap. We will continue to experiment and address the gap in future releases. +3. The mAP of the non-Plus version needs more verification, and we will update more details of the non-Plus version in future versions. + +```latex +@article{Xu2022PPYOLOEAE, + title={PP-YOLOE: An evolved version of YOLO}, + author={Shangliang Xu and Xinxin Wang and Wenyu Lv and Qinyao Chang and Cheng Cui and Kaipeng Deng and Guanzhong Wang and Qingqing Dang and Shengyun Wei and Yuning Du and Baohua Lai}, + journal={ArXiv}, + year={2022}, + volume={abs/2203.16250} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b7ed9487b60afecbd9db87f0ad89d9b3be8c93d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/metafile.yml @@ -0,0 +1,69 @@ +Collections: + - Name: PPYOLOE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - PPYOLOECSPResNet + - PPYOLOECSPPAFPN + Paper: + URL: https://arxiv.org/abs/2203.16250 + Title: 'PP-YOLOE: An evolved version of YOLO' + 
README: configs/ppyoloe/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: ppyoloe_plus_s_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 4.7 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth + - Name: ppyoloe_plus_m_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.5 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth + - Name: ppyoloe_plus_l_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 13.2 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.6 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth + - Name: ppyoloe_plus_x_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 19.1 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.2 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py 
b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ef1b4eaae7240e07a5e8450f35b6f71f2271e09f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is obtained and converted from the official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_l_imagenet1k_pretrained-c0010e6c.pth' # noqa + +deepen_factor = 1.0 +widen_factor = 1.0 + +train_batch_size_per_gpu = 20 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..abcfd7833016164fbef84a70366b958f28ea6648 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is obtained and converted from the official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_m_imagenet1k_pretrained-09f1eba2.pth' # noqa + +deepen_factor = 0.67 +widen_factor = 0.75 + +train_batch_size_per_gpu = 28 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9db53e26f4168e82b6cd760e1b8f41c0bebfae8f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_l_obj365_pretrained-3dd89562.pth' # noqa + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..17cb33556f7ff111a4d702e6798abda1aaafeb01 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_m_ojb365_pretrained-03206892.pth' # noqa + +deepen_factor = 0.67 +widen_factor = 0.75 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..752ff63388cee00156dc729b68242eae68e4d052 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +# Compared to other same scale models, this configuration consumes too much +# GPU memory and is not validated for now +_base_ = 'ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_last_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 2 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = 
dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +# Single dict: re-assigning `default_hooks` below would discard these keys. +default_hooks = dict( + param_scheduler=dict( + warmup_min_iter=10, + warmup_epochs=3, + total_epochs=int(max_epochs * 1.2)), + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d98252ccaec23c75b3e8aa3ddb095ee85010bd8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py @@ -0,0 +1,239 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# dataset settings +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' + +# parameters that often need to be modified +img_scale = (640, 640) # width, height +deepen_factor = 0.33 +widen_factor = 0.5 +max_epochs = 80 +num_classes = 80 +save_epoch_intervals = 5 +train_batch_size_per_gpu = 8 +train_num_workers = 8 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +# The pretrained model is obtained and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_s_obj365_pretrained-bcfe8478.pth' # noqa + +# persistent_workers must be False if num_workers is 0. 
+persistent_workers = True + +# Base learning rate for optim_wrapper +base_lr = 0.001 + +strides = [8, 16, 32] + +model = dict( + type='YOLODetector', + data_preprocessor=dict( + # use this to support multi_scale training + type='PPYOLOEDetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='PPYOLOEBatchRandomResize', + random_size_range=(320, 800), + interval=1, + size_divisor=32, + random_interp=True, + keep_ratio=False) + ], + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='PPYOLOECSPResNet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + attention_cfg=dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_large_stem=True), + neck=dict( + type='PPYOLOECSPPAFPN', + in_channels=[256, 512, 1024], + out_channels=[192, 384, 768], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csplayer=1, + num_blocks_per_layer=3, + block_cfg=dict( + type='PPYOLOEBasicBlock', shortcut=False, use_alpha=False), + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + drop_block_cfg=None, + use_spp=True), + bbox_head=dict( + type='PPYOLOEHead', + head_module=dict( + type='PPYOLOEHeadModule', + num_classes=num_classes, + in_channels=[192, 384, 768], + widen_factor=widen_factor, + featmap_strides=strides, + reg_max=16, + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + num_base_priors=1), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + 
iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + # Since the dflloss is implemented differently in the official + # and mmdet, we're going to divide loss_weight by 4. + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4)), + train_cfg=dict( + initial_epoch=30, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6, + eps=1e-9)), + test_cfg=dict( + multi_label=True, + nms_pre=1000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=300)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PPYOLOERandomDistort'), + dict(type='mmdet.Expand', mean=(103.53, 116.28, 123.675)), + dict(type='PPYOLOERandomCrop'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate', use_ms_training=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=0), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='mmdet.FixShapeResize', + width=img_scale[0], + height=img_scale[1], + keep_ratio=False, + interpolation='bicubic'), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + 
dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=0), + ann_file='annotations/instances_val2017.json', + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.9, + weight_decay=5e-4, + nesterov=False), + paramwise_cfg=dict(norm_decay_mult=0.)) + +default_hooks = dict( + param_scheduler=dict( + type='PPYOLOEParamSchedulerHook', + warmup_min_iter=1000, + start_factor=0., + warmup_epochs=5, + min_lr_ratio=0.0, + total_epochs=int(max_epochs * 1.2)), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=3)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b8e61120bee63c67da1ae31e492709381b365b47 
--- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_x_obj365_pretrained-43a8000d.pth' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..622332899cd4f8589559ed3484fb5affb6a7963b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py @@ -0,0 +1,36 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_s_imagenet1k_pretrained-2be81763.pth' # noqa + +train_batch_size_per_gpu = 32 +max_epochs = 300 + +# Base learning rate for optim_wrapper +base_lr = 0.01 + +model = dict( + data_preprocessor=dict( + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255., 0.224 * 255., 0.225 * 255.]), + backbone=dict( + block_cfg=dict(use_alpha=False), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=checkpoint, + map_location='cpu')), + train_cfg=dict(initial_epoch=100)) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) + +train_cfg = dict(max_epochs=max_epochs) + +# PPYOLOE-plus uses an obj365-pretrained model but plain PPYOLOE does not, +# so `load_from` needs to be set to None. 
+load_from = None diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bef9e9130d6194fceeb6471369941050110ace2d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py @@ -0,0 +1,9 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +max_epochs = 400 + +model = dict(train_cfg=dict(initial_epoch=133)) + +default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) + +train_cfg = dict(max_epochs=max_epochs) diff --git a/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fed594f0d08acf2fa64feffa419d0143d1036c55 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_x_imagenet1k_pretrained-81c33ccb.pth' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +train_batch_size_per_gpu = 16 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/README.md b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..456021bdd32036a31ca9863194dd74a174fcdd76 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/README.md @@ -0,0 +1,79 @@ +# Projects Based on MMRazor + +There are many research works and pre-trained models built on MMRazor. We list some of them as examples of how to use MMRazor slimmable models for downstream frameworks. As this page may be incomplete, please feel free to contribute more efficient mmrazor-models to update it. + +## Description + +This is an implementation of the MMRazor Searchable Backbone Application; we provide detection configs and models for MMRazor in MMYOLO. + +### Backbone support + +Here are the Neural Architecture Search (NAS) models from MMRazor that support the YOLO series. If you are looking for MMRazor models only for the backbone, you can refer to the MMRazor [ModelZoo](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/docs/en/get_started/model_zoo.md) and the corresponding repository. 
+ +- [x] [AttentiveMobileNetV3](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/attentive_mobilenetv3_supernet.py) +- [x] [SearchableShuffleNetV2](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/spos_shufflenet_supernet.py) +- [x] [SearchableMobileNetV2](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/spos_mobilenet_supernet.py) + +## Usage + +### Prerequisites + +- [MMRazor v1.0.0rc2](https://github.com/open-mmlab/mmrazor/tree/v1.0.0rc2) or higher (dev-1.x) + +Install MMRazor using MIM. + +```shell +mim install mmengine +mim install "mmrazor>=1.0.0rc2" +``` + +Install MMRazor from source + +``` +git clone -b dev-1.x https://github.com/open-mmlab/mmrazor.git +cd mmrazor +# Install MMRazor +mim install -v -e . +``` + +### Training commands + +In MMYOLO's root directory, if you want to use single GPU for training, run the following command to train the model: + +```bash +CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_train.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py +``` + +If you want to use several of these GPUs to train in parallel, you can use the following command: + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py +``` + +### Testing commands + +In MMYOLO's root directory, run the following command to test the model: + +```bash +CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py ${CHECKPOINT_PATH} +``` + +## Results and Models + +Here we provide the baseline version of YOLO Series with NAS backbone. 
+ +| Model | size | box AP | Params(M) | FLOPs(G) | Config | Download | +| :------------------------: | :--: | :----: | :----------: | :------: | :---------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| yolov5-s | 640 | 37.7 | 7.235 | 8.265 | [config](../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| yolov5_s_spos_shufflenetv2 | 640 | 38.0 | 7.04(-2.7%) | 7.03 | [config](./yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco_20230211_220635-578be9a9.pth) \| log | +| yolov6-s | 640 | 44.0 | 18.869 | 24.253 | [config](../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| yolov6_l_attentivenas_a6 | 640 | 45.3 | 18.38(-2.6%) | 8.49 | [config](./yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py) | 
[model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco_20230211_222409-dcc72668.pth) \| log | +| RTMDet-tiny | 640 | 41.0 | 4.8 | 8.1 | [config](../../rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| rtmdet_tiny_ofa_lat31 | 960 | 41.3 | 3.91(-18.5%) | 6.09 | [config](./rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco_20230214_210623-449bb2a0.pth) \| log | + +**Note**: + +1. For a fair comparison, the training configuration is kept consistent with the original configuration; the NAS backbones bring an improvement of about 0.2-0.5% AP. +2. `yolov5_s_spos_shufflenetv2` achieves 38.0% AP with only 7.042M parameters by directly replacing the backbone, and outperforms `yolov5_s` of a similar size by more than 0.3% AP. +3. With the efficient backbone of `yolov6_l_attentivenas_a6`, the input channels of `YOLOv6RepPAFPN` are reduced. Meanwhile, the **deepen_factor** is modified to make the neck deeper and restore the AP. +4. With the `rtmdet_tiny_ofa_lat31` backbone, which has only 3.315M parameters and 3.634G FLOPs, we can raise the input resolution to 960; the model keeps a size similar to `rtmdet_tiny`, exceeds `rtmdet_tiny` by 0.4% AP, and reduces the whole model to 3.91M parameters. 
diff --git a/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2f9da6685ef0ef920ceb137a165dfb8adcd36254 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py @@ -0,0 +1,124 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/ofa_mobilenetv3_supernet.py', + '../../rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/ofa/ofa_mobilenet_subnet_8xb256_in1k_note8_lat%4031ms_top1%4072.8_finetune%4025.py_20221214_0939-981a8b2a.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/OFA_SUBNET_NOTE8_LAT31.yaml' # noqa +deepen_factor = 0.167 +widen_factor = 1.0 +channels = [40, 112, 160] +train_batch_size_per_gpu = 16 +img_scale = (960, 960) + +_base_.nas_backbone.out_indices = (2, 4, 5) +_base_.nas_backbone.conv_cfg = dict(type='mmrazor.OFAConv2d') +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.deepen_factor = deepen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = channels[0] +_base_.model.bbox_head.head_module.in_channels = channels[0] +_base_.model.bbox_head.head_module.feat_channels = channels[0] +_base_.model.bbox_head.head_module.widen_factor = widen_factor + +_base_.model.test_cfg = dict( + multi_label=True, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100) + 
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=20, + random_pop=False, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + scale=(1280, 1280), + ratio_range=(0.5, 2.0), # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOXMixUp', + img_scale=(960, 960), + ratio_range=(1.0, 1.0), + max_cached_images=10, + use_cached=True, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=(0.5, 2.0), # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=(960, 960), keep_ratio=True), + dict(type='mmdet.Pad', size=(960, 960), pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) + 
+test_dataloader = val_dataloader + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +find_unused_parameters = True diff --git a/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..beb4941cfa482ec52e83abc67df70d9734fa3d3a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py @@ -0,0 +1,29 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/spos_shufflenet_supernet.py', + '../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_v3.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_subnet_cfg_v3.yaml' # noqa +widen_factor = 1.0 +channels = [160, 320, 640] + +_base_.nas_backbone.out_indices = (1, 2, 3) +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='architecture.backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = channels +_base_.model.bbox_head.head_module.in_channels = channels +_base_.model.bbox_head.head_module.widen_factor = widen_factor + 
+find_unused_parameters = True diff --git a/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab64a6460b3fbb29cc1a47a1bd1a2456bb11ac3 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,35 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/attentive_mobilenetv3_supernet.py', + '../../yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/bignas/attentive_mobilenet_subnet_8xb256_in1k_flops-0.93G_acc-80.81_20221229_200440-73d92cc6.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/bignas/ATTENTIVE_SUBNET_A6.yaml' # noqa +deepen_factor = 1.2 +widen_factor = 1 +channels = [40, 128, 224] +mid_channels = [40, 128, 224] + +_base_.train_dataloader.batch_size = 16 +_base_.nas_backbone.out_indices = (2, 4, 6) +_base_.nas_backbone.conv_cfg = dict(type='mmrazor.BigNasConv2d') +_base_.nas_backbone.norm_cfg = dict(type='mmrazor.DynamicBatchNorm2d') +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.deepen_factor = deepen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = mid_channels +_base_.model.bbox_head.head_module.in_channels = mid_channels +_base_.model.bbox_head.head_module.widen_factor = widen_factor + +find_unused_parameters = True diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/README.md 
b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..94e86546a34c3d70da4b51d81ff46e8ee7d5f242 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/README.md @@ -0,0 +1,83 @@ +# RTMDet: An Empirical Study of Designing Real-Time Object Detectors + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + + + +## Abstract + +In this paper, we aim to design an efficient real-time object detector that exceeds the YOLO series and is easily extensible for many object recognition tasks such as instance segmentation and rotated object detection. To obtain a more efficient model architecture, we explore an architecture that has compatible capacities in the backbone and neck, constructed by a basic building block that consists of large-kernel depth-wise convolutions. We further introduce soft labels when calculating matching costs in the dynamic label assignment to improve accuracy. 
Together with better training techniques, the resulting object detector, named RTMDet, achieves 52.8% AP on COCO with 300+ FPS on an NVIDIA 3090 GPU, outperforming the current mainstream industrial detectors. RTMDet achieves the best parameter-accuracy trade-off with tiny/small/medium/large/extra-large model sizes for various application scenarios, and obtains new state-of-the-art performance on real-time instance segmentation and rotated object detection. We hope the experimental results can provide new insights into designing versatile real-time object detectors for many object recognition tasks. + +
+ +
+ +
+ +RTMDet-l model structure +
+ +## Results and Models + +### Object Detection + +| Model | size | Params(M) | FLOPs(G) | TRT-FP16-Latency(ms) | box AP | TTA box AP | Config | Download | +| :------------: | :--: | :-------: | :------: | :------------------: | :---------: | :---------: | :---------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | 640 | 4.8 | 8.1 | 0.98 | 41.0 | 42.7 | [config](./rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| RTMDet-tiny \* | 640 | 4.8 | 8.1 | 0.98 | 41.8 (+0.8) | 43.2 (+0.5) | [config](./distillation/kd_tiny_rtmdet_s_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-176901d8.json) | +| RTMDet-s | 640 | 8.89 | 14.8 | 1.22 | 44.6 | 45.8 | [config](./rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329.log.json) | +| 
RTMDet-s \* | 640 | 8.89 | 14.8 | 1.22 | 45.7 (+1.1) | 47.3 (+1.5) | [config](./distillation/kd_s_rtmdet_m_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-89862269.json) | +| RTMDet-m | 640 | 24.71 | 39.27 | 1.62 | 49.3 | 50.9 | [config](./rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952.log.json) | +| RTMDet-m \* | 640 | 24.71 | 39.27 | 1.62 | 50.2 (+0.9) | 51.9 (+1.0) | [config](./distillation/kd_m_rtmdet_l_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-bd028fd3.json) | +| RTMDet-l | 640 | 52.3 | 80.23 | 2.44 | 51.4 | 53.1 | [config](./rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928.log.json) | +| RTMDet-l \* | 640 | 52.3 | 80.23 | 2.44 | 52.3 (+0.9) | 53.7 (+0.6) | [config](./distillation/kd_l_rtmdet_x_neck_300e_coco.py) | 
[model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c5c4e17b.json) | +| RTMDet-x | 640 | 94.86 | 141.67 | 3.10 | 52.8 | 54.2 | [config](./rtmdet_x_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345.log.json) | + +**Note**: + +1. The inference speed of RTMDet is measured on an NVIDIA 3090 GPU with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1, and without NMS. +2. For a fair comparison, the config of bbox postprocessing is changed to be consistent with YOLOv5/6/7 after [PR#9494](https://github.com/open-mmlab/mmdetection/pull/9494), bringing about 0.1~0.3% AP improvement. +3. `TTA` means Test Time Augmentation. It performs 3 multi-scale transformations on the image, followed by 2 flipping transformations (flipping and not flipping). You only need to specify `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details. +4. \* means checkpoints are trained with knowledge distillation. More details can be found in [RTMDet distillation](./distillation). + +### Rotated Object Detection + +RTMDet-R achieves state-of-the-art performance on various remote sensing datasets. 
+ +| Backbone | pretrain | Epoch | Batch Size | Aug | mmAP | mAP50 | mAP75 | Mem (GB) | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms) | Config | Download | +| :---------: | :------: | :---: | :--------: | :-------------: | :---: | :---: | :---: | :------: | :-------: | :------: | :------------------: | :--------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | IN | 36 | 1xb8 | RR | 46.94 | 75.07 | 50.11 | 12.7 | 4.88 | 20.45 | 4.40 | [config](./rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210-e8ccfb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210.log.json) | +| RTMDet-s | IN | 36 | 1xb8 | RR | 48.99 | 77.33 | 52.65 | 16.6 | 8.86 | 37.62 | 4.86 | [config](./rotated/rtmdet-r_s_fast_1xb8-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307-3946a5aa.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307.log.json) | +| RTMDet-m | IN | 36 | 2xb4 | RR | 50.38 | 78.43 | 54.28 | 10.9 | 24.67 | 99.76 | 7.82 | [config](./rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237-29ae1619.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237.log.json) | +| RTMDet-l | IN | 36 | 2xb4 | RR | 50.61 | 78.66 | 54.95 | 16.1 | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544-38bc5f08.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544.log.json) | +| RTMDet-tiny | IN | 36 | 1xb8 | MS+RR | - | - | - | | 4.88 | 20.45 | 4.40 | [config](./rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py) | \| | +| RTMDet-s | IN | 36 | 1xb8 | MS+RR | - | - | - | | 8.86 | 37.62 | 4.86 | [config](./rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py) | \| | +| RTMDet-m | IN | 36 | 2xb4 | MS+RR | - | - | - | | 24.67 | 99.76 | 7.82 | [config](./rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | IN | 36 | 2xb4 | MS+RR | - | - | - | | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | COCO | 36 | 2xb4 | MS+RR | - | - | - | | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | IN | 100 | 2xb4 | Mixup+Mosaic+RR | 55.05 | 80.14 | 61.32 | 19.6 | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735-ed4ea966.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735.log.json) | + +**Note**: + +1. Please follow doc to get start with rotated detection. 
[Rotated Object Detection](../../docs/zh_cn/tutorials/rotated_detection.md) +2. We follow the latest metrics from the DOTA evaluation server; the original VOC-format mAP is now reported as mAP50. +3. All models are trained with image size 1024\*1024. +4. `IN` means ImageNet pretrain, `COCO` means COCO pretrain. +5. For Aug, RR means `RandomRotate`, MS means multi-scale augmentation during data preparation. +6. The inference speed here is measured on an NVIDIA 2080Ti GPU with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1, and with NMS. +7. Currently, the training process of RTMDet-R tiny is unstable and may have 1% accuracy fluctuation; we will continue to investigate why. + +## Citation + +```latex +@misc{lyu2022rtmdet, + title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors}, + author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen}, + year={2022}, + eprint={2212.07784}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2db5a50ec5ed0d3b499ca7d3c83bc4963c95af3f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md @@ -0,0 +1,53 @@ +# CSPNeXt ImageNet Pre-training + +In this folder, we provide the ImageNet pre-training config of RTMDet's backbone CSPNeXt. + +## Requirements + +To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first. + +Install by MIM: + +```shell +mim install "mmcls>=1.0.0rc0" +``` + +or install by pip: + +```shell +pip install "mmcls>=1.0.0rc0" +``` + +## Prepare Dataset + +To pre-train on ImageNet, you need to prepare the dataset first. 
Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet). + +## How to Train + +You can use the classification config in the same way as the detection config. + +For single-GPU training, run: + +```shell +python tools/train.py \ + ${CONFIG_FILE} \ + [optional arguments] +``` + +For multi-GPU training, run: + +```shell +bash ./tools/dist_train.sh \ + ${CONFIG_FILE} \ + ${GPU_NUM} \ + [optional arguments] +``` + +More details can be found in [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html). + +## Results and Models + +| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.339 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.664 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth) | diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000000000000000000000000000000000000..4281f9cd7d260f22d7b0e8d18d2c4f56866ad840 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,67 @@ +_base_ = [ + 'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py', + 'mmcls::_base_/schedules/imagenet_bs2048_rsb.py', + 'mmcls::_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['mmdet.models', 'mmyolo.models'], 
allow_failed_imports=False) + +model = dict( + type='ImageClassifier', + backbone=dict( + type='mmyolo.CSPNeXt', + arch='P5', + out_indices=(4, ), + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='mmyolo.SiLU')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=512, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + loss_weight=1.0), + topk=(1, 5)), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.2, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) + +# dataset settings +train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True)) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(weight_decay=0.01), + paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), +) + +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=True, + begin=0, + end=5, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict( + type='CosineAnnealingLR', + T_max=595, + eta_min=1.0e-6, + by_epoch=True, + begin=5, + end=600) +] + +train_cfg = dict(by_epoch=True, max_epochs=600) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000000000000000000000000000000000000..af3170bdc51778c4601d4426aa88cc27c608f100 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=0.167, widen_factor=0.375), + head=dict(in_channels=384)) diff --git 
a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/README.md b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..452a46cb9904a1782c0fee9cd7d469c0749caadb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/README.md @@ -0,0 +1,146 @@ +# Distill RTM Detectors Based on MMRazor + +## Description + +To further improve the model accuracy while not introducing much additional +computation cost, we apply the feature-based distillation to the training phase +of these RTM detectors. In summary, our distillation strategy is threefold: + +(1) Inspired by [PKD](https://arxiv.org/abs/2207.02039), we first normalize +the intermediate feature maps to have zero mean and unit variance before calculating +the distillation loss. + +(2) Inspired by [CWD](https://arxiv.org/abs/2011.13256), we adopt the channel-wise +distillation paradigm, which can pay more attention to the most salient regions +of each channel. + +(3) Inspired by [DAMO-YOLO](https://arxiv.org/abs/2211.15444), the distillation +process is split into two stages. 1) The teacher distills the student at the +first stage (280 epochs) on the strong mosaic domain. 2) The student finetunes itself +on the no-mosaic domain at the second stage (20 epochs). 
+ +## Results and Models + +| Location | Dataset | Teacher | Student | mAP | mAP(T) | mAP(S) | Config | Download | +| :------: | :-----: | :---------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------: | :---------: | :----: | :----: | :------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FPN | COCO | [RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-tiny](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | 41.8 (+0.8) | 44.6 | 41.0 | [config](kd_tiny_rtmdet_s_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \|[model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-176901d8.json) | +| FPN | COCO | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | 
[RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | 45.7 (+1.1) | 49.3 | 44.6 | [config](kd_s_rtmdet_m_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \|[model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-89862269.json) | +| FPN | COCO | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | 50.2 (+0.9) | 51.4 | 49.3 | [config](kd_m_rtmdet_l_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \|[model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-bd028fd3.json) | +| FPN | COCO | [RTMDet-x](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | 52.3 (+0.9) | 52.8 | 51.4 | [config](kd_l_rtmdet_x_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth) 
\|[model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c5c4e17b.json) | + +## Usage + +### Prerequisites + +- [MMRazor dev-1.x](https://github.com/open-mmlab/mmrazor/tree/dev-1.x) + +Install MMRazor from source + +``` +git clone -b dev-1.x https://github.com/open-mmlab/mmrazor.git +cd mmrazor +# Install MMRazor +mim install -v -e . +``` + +### Training commands + +In MMYOLO's root directory, run the following command to train the RTMDet-tiny +with 8 GPUs, using RTMDet-s as the teacher: + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py 8 +``` + +### Testing commands + +In MMYOLO's root directory, run the following command to test the model: + +```bash +CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py ${CHECKPOINT_PATH} 1 +``` + +### Getting student-only checkpoint + +After training, the checkpoint contains parameters for both student and teacher models. +Run the following command to convert it to a student-only checkpoint: + +```bash +python ./tools/model_converters/convert_kd_ckpt_to_student.py ${CHECKPOINT_PATH} --out-path ${OUTPUT_CHECKPOINT_PATH} +``` + +## Configs + +Here we provide detection configs and models for MMRazor in MMYOLO. For clarity, +we take `./kd_tiny_rtmdet_s_neck_300e_coco.py` as an example to show how to +distill an RTM detector based on MMRazor. + +Here is the main part of `./kd_tiny_rtmdet_s_neck_300e_coco.py`. 
+ +```shell +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +distiller=dict( + type='ConfigurableDistiller', + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + connectors=dict( + fpn0_s=dict(type='ConvModuleConnector', in_channel=96, + out_channel=128, bias=False, norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', in_channel=96, + out_channel=128, bias=False, norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', in_channel=96, + out_channel=128, bias=False, norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict(from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict(from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict(from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict(from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict(from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict(from_student=False, recorder='fpn2', connector='fpn2_t')))) + +``` + +`recorders` are used to 
record various intermediate results during the model forward. +In this example, they can help record the output of 3 `nn.Module` of the teacher +and the student. Details are listed in [Recorder](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/docs/en/advanced_guides/recorder.md) and [MMRazor Distillation](https://zhuanlan.zhihu.com/p/596582609) (if users can read Chinese). + +`connectors` are adaptive layers which usually map the teacher's and the student's features +to the same dimension. + +`distill_losses` are configs for multiple distill losses. + +`loss_forward_mappings` are mappings between distill loss forward arguments and records. + +In addition, the student finetunes itself on the no-mosaic domain during the last 20 epochs, +so we add a new hook named `StopDistillHook` to stop distillation on time. +We need to add this hook to the `custom_hooks` list like this: + +```shell +custom_hooks = [..., dict(type='mmrazor.StopDistillHook', stop_epoch=280)] +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2bab26a0d20342c38d7d1ec0a8221fdc426f016b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + cfg_path='mmyolo::rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py'), + 
teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # students features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=256, + out_channel=320, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=320, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=256, + out_channel=320, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=320, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=256, + out_channel=320, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=320, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. 
+ loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d7f9211f1f77c4d83677f7f6c485a5c6212252 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_m_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + 
cfg_path='mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # students features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. 
+ loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..99b5dc5e48d04fed927cbd80c1538ca99912fc1b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_s_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + 
cfg_path='mmyolo::rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # student's features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. 
+ loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..50c23580bf6b7c1a120267a65bc7cc334513c475 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + 
cfg_path='mmyolo::rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # student's features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. 
+ loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..704a44ba83c90d1c639d4bcbabf88b72fa867553 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/metafile.yml @@ -0,0 +1,215 @@ +Collections: + - Name: RTMDet + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Flat Cosine Annealing + Training Resources: 8x A100 GPUs + Architecture: + - CSPNeXt + - CSPNeXtPAFPN + README: configs/rtmdet/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.1 + - Name: Rotated_RTMDet + Metadata: + Training Data: DOTAv1.0 + Training Techniques: + - AdamW + - Flat Cosine Annealing + Training Resources: 1x A100 GPUs + Architecture: + - CSPNeXt + - CSPNeXtPAFPN + README: configs/rtmdet/README.md + Code: + URL: 
https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.1 + +Models: + - Name: rtmdet_tiny_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 11.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth + + - Name: kd_tiny_rtmdet_s_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py + Metadata: + Training Memory (GB): 11.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth + + - Name: rtmdet_s_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 15.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.6 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth + + - Name: kd_s_rtmdet_m_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py + Metadata: + Training Memory (GB): 16.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.7 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth + + - Name: rtmdet_m_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: 
configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 27.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth + + - Name: kd_m_rtmdet_l_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py + Metadata: + Training Memory (GB): 29.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.2 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth + + - Name: rtmdet_l_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 43.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.4 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth + + - Name: kd_l_rtmdet_x_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py + Metadata: + Training Memory (GB): 45.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.3 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth + + - Name: rtmdet_x_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 63.4 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.8 + Weights: 
https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth + + - Name: rtmdet-r_tiny_fast_1xb8-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py + Metadata: + Training Memory (GB): 12.7 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 75.07 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210-e8ccfb1c.pth + + - Name: rtmdet-r_s_fast_1xb8-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py + Metadata: + Training Memory (GB): 16.6 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 77.33 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307-3946a5aa.pth + + - Name: rtmdet-r_m_syncbn_fast_2xb4-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 10.9 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 78.43 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237-29ae1619.pth + + - Name: rtmdet-r_l_syncbn_fast_2xb4-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 16.1 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 78.66 + Weights: 
https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544-38bc5f08.pth + + - Name: rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 19.6 + Epochs: 100 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 80.14 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735-ed4ea966.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..ef29a1d051b84d8c546edb3cabb958ec586e1261 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py @@ -0,0 +1,30 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +# ========================modified parameters====================== +data_root = 'data/split_ms_dota/' +# Path of test images folder +test_data_prefix = 'test/images/' +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +train_dataloader = dict(dataset=dict(data_root=data_root)) + +val_dataloader = dict(dataset=dict(data_root=data_root)) + +# Inference on val dataset +test_dataloader = val_dataloader + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb2ae77a370a73e463068e11291afb4a59cda02 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py @@ -0,0 +1,331 @@ +_base_ = '../../_base_/default_runtime.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth' # noqa + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/split_ss_dota/' +# Path of train annotation folder +train_ann_file = 'trainval/annfiles/' +train_data_prefix = 'trainval/images/' # Prefix of train image path +# Path of val annotation folder +val_ann_file = 'trainval/annfiles/' +val_data_prefix = 'trainval/images/' # Prefix of val image path +# Path of test images folder +test_data_prefix = 'test/images/' + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +num_classes = 15 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 4 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. 
Corresponding to 1xb8=8 bs +base_lr = 0.00025 # 0.004 / 16 +max_epochs = 36 # Maximum training epochs + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # Decode rbox with angle, For RTMDet-R, Defaults to True. + # When set to True, use rbox coder such as DistanceAnglePointCoder + # When set to False, use hbox coder such as DistancePointBBoxCoder + # different setting lead to different AP. + decode_with_angle=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.05, # Threshold to filter out boxes. + nms=dict(type='nms_rotated', iou_threshold=0.1), # NMS type and threshold + max_per_img=2000) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (1024, 1024) # width, height +# ratio for random rotate +random_rotate_ratio = 0.5 +# label ids for rect objs +rotate_rect_obj_labels = [9, 11] +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5DOTADataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 8 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 8 + +# Config of batch shapes. Only on val. 
Not used in RTMDet-R +batch_shapes_cfg = None + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1.0 +# The scaling factor that controls the width of the network structure +widen_factor = 1.0 +# Strides of multi-scale prior box +strides = [8, 16, 32] +# The angle definition for model +angle_version = 'le90' # le90, le135, oc are available options + +norm_cfg = dict(type='BN') # Normalization config + +# -----train val related----- +lr_start_factor = 1.0e-5 +dsl_topk = 13 # Number of bbox selected in each level +loss_cls_weight = 1.0 +loss_bbox_weight = 2.0 +qfl_beta = 2.0 # beta of QualityFocalLoss +weight_decay = 0.05 + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + type='CSPNeXtPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='RTMDetRotatedHead', + head_module=dict( + type='RTMDetRotatedSepBNHeadModule', + num_classes=num_classes, + widen_factor=widen_factor, + in_channels=256, + stacked_convs=2, + feat_channels=256, 
+ norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + share_conv=True, + pred_kernel_size=1, + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0, strides=strides), + bbox_coder=dict( + type='DistanceAnglePointCoder', angle_version=angle_version), + loss_cls=dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=qfl_beta, + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='mmrotate.RotatedIoULoss', + mode='linear', + loss_weight=loss_bbox_weight), + angle_version=angle_version, + # Used for angle encode and decode, similar to bbox coder + angle_coder=dict(type='mmrotate.PseudoAngleCoder'), + # If true, it will apply loss_bbox on horizontal box, and angle_loss + # needs to be specified. In this case the loss_bbox should use + # horizontal box loss e.g. IoULoss. Arg details can be seen in + # `docs/zh_cn/tutorials/rotated_detection.md` + use_hbbox_loss=False, + loss_angle=None), + train_cfg=dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=num_classes, + topk=dsl_topk, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + # RBboxOverlaps2D doesn't support batch input, use loop instead. 
+ batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=model_test_cfg, +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='RegularizeRotatedBox', angle_version=angle_version), + dict(type='mmdet.PackDetInputs') +] + +val_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='LoadAnnotations', + with_bbox=True, + box_type='qbox', + _scope_='mmdet'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( 
+ type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img_path=train_data_prefix), + filter_cfg=dict(filter_empty_gt=True), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img_path=val_data_prefix), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=val_pipeline)) + +val_evaluator = dict(type='mmrotate.DOTAMetric', metric='mAP') + +# Inference on val dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# batch_size=val_batch_size_per_gpu, +# num_workers=val_num_workers, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# data_prefix=dict(img_path=test_data_prefix), +# test_mode=True, +# batch_shapes_cfg=batch_shapes_cfg, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=weight_decay), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from max_epochs // 2 to max_epochs + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + 
convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts, # only keep latest 3 checkpoints + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +visualizer = dict(type='mmrotate.RotLocalVisualizer') diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..dcafa55db97ffd543af3bc382d15de361cadbd75 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py @@ -0,0 +1,168 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +# This config uses a longer schedule with Mixup, Mosaic and Random Rotate. + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth' # noqa + +# ========================modified parameters====================== + +# Base learning rate for optim_wrapper. 
Corresponding to 1xb8=8 bs +base_lr = 0.00025 # 0.004 / 16 +lr_start_factor = 1.0e-5 +max_epochs = 100 # Maximum training epochs +# Change train_pipeline for final 10 epochs (stage 2) +num_epochs_stage2 = 10 + +img_scale = (1024, 1024) # width, height +# ratio range for random resize +random_resize_ratio_range = (0.1, 2.0) +# Cached images number in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 +# ratio for random rotate +random_rotate_ratio = 0.5 +# label ids for rect objs +rotate_rect_obj_labels = [9, 11] + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 1 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + 
use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from max_epochs // 2 to max_epochs + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts, # only keep latest 3 checkpoints + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + 
val_interval=save_checkpoint_intervals, + dynamic_intervals=[(max_epochs - num_epochs_stage2, val_interval_stage2)]) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9f50cdded21c36f9b76b49e291b60e0a2dff07 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py @@ -0,0 +1,20 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..4be8605f6de383c4e39edae6cfdc19f5ea005353 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py @@ -0,0 +1,33 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..8df61cffd6e165e36965b2622735abb93fbe8d83 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py @@ -0,0 +1,33 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..2b7b0b6ffee9cdf2720696ce6fe51b87927ada6e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..d200dd76491dafb306900de23a25359224205d13 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..56bf038b6500bb0640160e680ddbb5e4c34fd3f8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..739a2de8020ad6879a8401255395df2e807f66c4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..279a7990bc4a58a5c10bfc3dd29e570c7e3a14cc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,31 @@ +_base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' + +widen_factor = 0.5 + +model = dict( + bbox_head=dict( + type='RTMDetInsSepBNHead', + head_module=dict( + type='RTMDetInsSepBNHeadModule', + use_sigmoid_cls=True, + widen_factor=widen_factor), + loss_mask=dict( + type='mmdet.DiceLoss', loss_weight=2.0, eps=5e-6, + reduction='mean')), + test_cfg=dict( + multi_label=True, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5)) + +_base_.test_pipeline[-2] = dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, _scope_='mmdet') + +val_dataloader = dict(dataset=dict(pipeline=_base_.test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c36ac38ce16db6bbd66fe0c2271c34c252a538ab --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,304 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 10 +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb32=256 bs +base_lr = 0.004 +max_epochs = 300 # Maximum training epochs +# Change train_pipeline for final 20 epochs (stage 2) +num_epochs_stage2 = 20 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. 
+ nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# ratio range for random resize +random_resize_ratio_range = (0.1, 2.0) +# Cached images number in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 10 + +# Config of batch shapes. Only on val. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1.0 +# The scaling factor that controls the width of the network structure +widen_factor = 1.0 +# Strides of multi-scale prior box +strides = [8, 16, 32] + +norm_cfg = dict(type='BN') # Normalization config + +# -----train val related----- +lr_start_factor = 1.0e-5 +dsl_topk = 13 # Number of bbox selected in each level +loss_cls_weight = 1.0 +loss_bbox_weight = 2.0 +qfl_beta = 2.0 # beta of QualityFocalLoss +weight_decay = 0.05 + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='CSPNeXtPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='RTMDetHead', + head_module=dict( + type='RTMDetSepBNHeadModule', + num_classes=num_classes, + in_channels=256, + stacked_convs=2, + feat_channels=256, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + share_conv=True, + pred_kernel_size=1, + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=qfl_beta, + loss_weight=loss_cls_weight), + loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=loss_bbox_weight)), + train_cfg=dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=num_classes, + topk=dsl_topk, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=model_test_cfg, +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is 
(width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + 
batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +# Reduce evaluation time +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=weight_decay), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts # only keep latest 3 checkpoints + )) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals, + dynamic_intervals=[(max_epochs - num_epochs_stage2, val_interval_stage2)]) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git 
a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..52576bf41689f462e46e83e6236de91ead43e97c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8cead7805974a0a9434f41623ab92beb87fadc60 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,92 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + 
widen_factor=widen_factor, + # Since the checkpoint includes CUDA:0 data, + # it must be forced to set map_location. + # Once checkpoint is fixed, it can be removed. + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=checkpoint, + map_location='cpu')), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + 
strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1182c5ef663efdf06801c6cc22991b9545b2ea --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py @@ -0,0 +1,70 @@ +_base_ = 'rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_epochs_stage2 = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=_base_.lr_start_factor, + by_epoch=False, + begin=0, + end=30), + dict( + # use cosine lr for the second half of training (epoch 20 to 40) + 
type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +_base_.custom_hooks[1].switch_epoch = max_epochs - num_epochs_stage2 + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..257110d22e9f2330e4c5378001eaf72f6bb885d1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,58 @@ +_base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 20 +# Number of cached images in mixup +mixup_max_cached_images = 10 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + 
+train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, # note + random_pop=False, # note + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + random_pop=False, + max_cached_images=mixup_max_cached_images, + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc9001f99ef3d468994c8201d43f08500bdeef9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/README.md b/models/YOLO-World/third_party/mmyolo/configs/yolov5/README.md new file 
mode 100644 index 0000000000000000000000000000000000000000..bd33e83f430b9309e4c0e95902a61db0dd7ae002 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/README.md @@ -0,0 +1,146 @@ +# YOLOv5 + + + +## Abstract + +YOLOv5 is a family of object detection architectures and models pretrained on the COCO dataset, and represents Ultralytics open-source research into future vision AI methods, incorporating lessons learned and best practices evolved over thousands of hours of research and development. + +
+ +YOLOv5-l-P5 model structure +
+ +
+ +YOLOv5-l-P6 model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :-------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | No | Yes | Yes | 1.5 | 28.0 | 30.7 | [config](./yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json) | +| YOLOv5-n | P5 | 640 | Yes | Yes | Yes | 1.5 | 28.0 | | [config](./mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706.log.json) | +| YOLOv5u-n | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_n_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-s | P5 | 640 | No | Yes | Yes | 2.7 | 37.7 | 40.2 | 
[config](./yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | Yes | 2.7 | 38.0 (+0.3) | | [config](./mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134.log.json) | +| YOLOv5u-s | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_s_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-m | P5 | 640 | No | Yes | Yes | 5.0 | 45.3 | 46.9 | [config](./yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | Yes | 5.0 | 45.3 | | [config](./mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946.log.json) | +| YOLOv5u-m | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_m_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-l | P5 | 640 | No | Yes | Yes | 8.1 | 48.8 | 49.9 | [config](./yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | Yes | 8.1 | 49.3 (+0.5) | | [config](./mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301.log.json) | +| YOLOv5u-l | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_l_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-x | P5 | 640 | No | Yes | Yes | 12.2 | 50.2 | | [config](./yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943.log.json) | +| YOLOv5-x | P5 | 640 
| Yes | Yes | Yes | 12.2 | 50.9 (+0.7) | | [config](./mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321.log.json) | +| YOLOv5u-x | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_x_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-n | P6 | 1280 | No | Yes | Yes | 5.8 | 35.9 | | [config](./yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705.log.json) | +| YOLOv5-s | P6 | 1280 | No | Yes | Yes | 10.5 | 44.4 | | [config](./yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044.log.json) | +| YOLOv5-m | P6 | 1280 | No | Yes | Yes | 19.1 | 51.3 | | [config](./yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453.log.json) | +| YOLOv5-l | P6 | 1280 | No | Yes | Yes | 30.5 | 53.7 | | [config](./yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308.log.json) | + +**Note**: + +1. `fast` means that `YOLOv5DetDataPreprocessor` and `yolov5_collate` are used for data preprocessing, which is faster for training, but less flexible for multitasking. We recommend using the fast version config if you only care about object detection. +2. `detect` means that the network input is fixed to `640x640` and the post-processing thresholds are modified. +3. `SyncBN` means that SyncBN is used; `AMP` indicates training with mixed precision. +4. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code. +5. The performance is unstable and may fluctuate by about 0.4 mAP, and the best-performing weights when training `YOLOv5` on `COCO` may not come from the last epoch. +6. `TTA` means Test Time Augmentation. It performs 3 multi-scale transformations on the image, followed by 2 flipping transformations (flipped and not flipped). You only need to specify `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details. +7. The performance of `Mask Refine` training is for the weight performance officially released by YOLOv5. `Mask Refine` means refining bbox by mask while loading annotations and transforming after `YOLOv5RandomAffine`, `Copy Paste` means using `YOLOv5CopyPaste`. +8. 
`YOLOv5u` models use the same loss functions and split Detect head as `YOLOv8` models for improved performance, but only requires 300 epochs. + +### COCO Instance segmentation + +| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | Box AP | Mask AP | Config | Download | +| :-------------------: | :--: | :--: | :----: | :-: | :------: | :----: | :-----: | :--------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | Yes | Yes | 3.3 | 27.9 | 23.7 | [config](./ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | 4.8 | 38.1 | 32.0 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542.log.json) | +| 
YOLOv5-s(non-overlap) | P5 | 640 | Yes | Yes | 4.8 | 38.0 | 32.1 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 7.3 | 45.1 | 37.3 | [config](./ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | 10.7 | 48.8 | 39.9 | [config](./ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049.log.json) | +| YOLOv5-x | P5 | 640 | Yes | Yes | 15.0 | 50.6 | 41.4 | [config](./ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925.log.json) | + +**Note**: + +1. `Non-overlap` refers to the instance-level masks being stored in the format (num_instances, h, w) instead of (h, w). Storing masks in overlap format consumes less memory and GPU memory. +2. For the M model, the `affine_scale` parameter should be 0.9, but due to some reason, we set it to 0.5 and found that the mAP did not change. Therefore, the released M model has an `affine_scale` parameter of 0.5, which is inconsistent with the value of 0.9 in the configuration. + +### VOC + +| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | Config | Download | +| :------: | :--: | :-------: | :-: | :------: | :-----------------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | [config](./yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254.log.json) | +| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | [config](./yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156.log.json) | +| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | [config](./yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138.log.json) | +| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | [config](./yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500.log.json) | + +**Note**: + +1. Training on the VOC dataset requires a model pretrained on COCO. +2. The performance is unstable and may fluctuate by about 0.4 mAP. +3. The official YOLOv5 uses the COCO metric when training on the VOC dataset. +4. We converted the VOC test dataset to COCO format offline to reproduce the mAP results shown above. Using the COCO metric while training on the VOC dataset will be supported in a later version. +5. Hyperparameters are taken from `https://wandb.ai/glenn-jocher/YOLOv5_VOC_official`. + +### CrowdHuman + +Since the `iscrowd` annotation of the COCO dataset is not equivalent to `ignore`, we use the CrowdHuman dataset to verify that the YOLOv5 ignore logic is correct. 
+ +| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowdHuman Metric) | MR | JI | Config | Download | +| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | :------------------------------------------------------------------------: | :------: | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | [config](./yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py) | | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | [config](./yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) | | + +**Note**: + +1. An `ignore_iof_thr` of -1 indicates that the ignore tag is not considered. We tried `ignore_iof_thr` thresholds of 0.5, 0.8, and 0.9, and the results show that 0.5 performs best. +2. The above table shows the performance of the model with the best performance on the validation set. The best-performing models are obtained at around epoch 160+, which means that training for so many epochs is unnecessary. +3. This is a very simple implementation that simply replaces COCO's anchor with the `tools/analysis_tools/optimize_anchors.py` script. We'll adjust other parameters later to improve performance. 
+ +## Citation + +```latex +@software{glenn_jocher_2022_7002879, + author = {Glenn Jocher and + Ayush Chaurasia and + Alex Stoken and + Jirka Borovec and + NanoCode012 and + Yonghye Kwon and + TaoXie and + Kalen Michael and + Jiacong Fang and + imyhxy and + Lorna and + Colin Wong and + 曾逸夫(Zeng Yifu) and + Abhiram V and + Diego Montes and + Zhiqiang Wang and + Cristi Fati and + Jebastin Nadar and + Laughing and + UnglvKitDe and + tkianai and + yxNONG and + Piotr Skalski and + Adam Hogan and + Max Strobel and + Mrinal Jain and + Lorenzo Mammana and + xylieong}, + title = {{ultralytics/yolov5: v6.2 - YOLOv5 Classification + Models, Apple M1, Reproducibility, ClearML and + Deci.ai integrations}}, + month = aug, + year = 2022, + publisher = {Zenodo}, + version = {v6.2}, + doi = {10.5281/zenodo.7002879}, + url = {https://doi.org/10.5281/zenodo.7002879} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..85b371929acd68bfd06cc257d20978c3fcc36db7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py @@ -0,0 +1,63 @@ +_base_ = 'yolov5_s-v61_fast_8xb16-300e_crowdhuman.py' + +model = dict( + data_preprocessor=dict( + _delete_=True, + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + bbox_head=dict(ignore_iof_thr=0.5)) + +img_scale = _base_.img_scale + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + # only change this + dict(type='mmdet.LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + 
*pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + collate_fn=dict(type='pseudo_collate'), + dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..a61859fa0f2c0ea8a08ffd7783adc4ccac8540dd --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py @@ -0,0 +1,47 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# Use the model trained on the COCO as the pretrained model +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# dataset settings +data_root = 'data/CrowdHuman/' +dataset_type = 'YOLOv5CrowdHumanDataset' + +# parameters that often need to be modified +num_classes = 1 + +anchors = [ + [(6, 14), (12, 28), (19, 48)], # P3/8 + [(29, 79), (46, 124), (142, 54)], # P4/16 + [(73, 198), (124, 330), (255, 504)] # 
P5/32 +] + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_train.odgt', + data_prefix=dict(img='Images/'))) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_val.odgt', + data_prefix=dict(img='Images/'), + # CrowdHumanMetric does not support out-of-order output images + # for the time being. batch_shapes_cfg does not support. + batch_shapes_cfg=None)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='mmdet.CrowdHumanMetric', + ann_file=data_root + 'annotation_val.odgt', + metric=['AP', 'MR', 'JI']) +test_evaluator = val_evaluator diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..6b27c7647bd233172e11df8e5a736946d70acfe0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,81 @@ +_base_ = './yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. 
+ dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..831e815cb2f982e92c9995bd6e012bcce95950f6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,89 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + 
((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..e06130bd317dba004a7fa1d5de0750f5b1cd21cf --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..82e2ae6d059df466940fc3df84ce53102ffec081 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py @@ -0,0 +1,42 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' +# Path of train annotation file +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'val.json' +val_data_prefix = 'val/' # Prefix of val image path +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 + +train_batch_size_per_gpu = 4 +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = 
dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab980ca7dfdd9c2feaba660f8745c92b49e6bbc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,126 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' # noqa + +# ========================modified parameters====================== +# YOLOv5RandomAffine +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = True +# LetterResize +# half_pad_param: if set to True, left and right pad_param will +# be given by dividing padding_h by 2. If set to False, pad_param is +# in int format. We recommend setting this to False for object +# detection tasks, and True for instance segmentation tasks. +# Default to False. +half_pad_param = True + +# Testing takes a long time due to model_test_cfg. +# If you want to speed it up, you can increase score_thr +# or decrease nms_pre and max_per_img +model_test_cfg = dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5, + # fast_test: Whether to use fast test methods. When set + # to False, the implementation here is the same as the + # official, with higher mAP. 
If set to True, mask will first + # be upsampled to origin image shape through Pytorch, and + # then use mask_thr_binary to determine which pixels belong + # to the object. If set to False, will first use + # mask_thr_binary to determine which pixels belong to the + # object , and then use opencv to upsample mask to origin + # image shape. Default to False. + fast_test=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + bbox_head=dict( + type='YOLOv5InsHead', + head_module=dict( + type='YOLOv5InsHeadModule', mask_channels=32, proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, reduction='none'), + loss_mask_weight=0.05), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + max_aspect_ratio=max_aspect_ratio, + use_mask_refine=use_mask2refine), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. 
+ dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + half_pad_param=half_pad_param, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..83b48cab69ade156f69864d11b37af597dd82da2 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py @@ -0,0 +1,49 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +mask_overlap = 
False # Polygon2Mask + +# ===============================Unmodified in most cases==================== +model = dict(bbox_head=dict(mask_overlap=mask_overlap)) + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=True), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes', + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..a18170ccc30c541f583ca3f4eaf829b853ed2816 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + 
deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..206eec3c41542958ae105764fbf3991935b30bc8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,77 @@ +_base_ = './yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + 
border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup and copypaste +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4af27a917e6113f33ff72781eeee911381bbed53 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,86 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = 
dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py 
b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3fe8dc32ceaf687940596f6b8094d79857921deb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..74febbb7764435d7ab4d9a8014fb6977a269da68 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', 
backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fb76f1057872d81f52ac9369a689545194a61bb7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/yolov5/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..bfe5add4fa0f268a8a6566c7ddc2e9b46a92ffe7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/metafile.yml @@ -0,0 +1,346 @@ +Collections: + - Name: YOLOv5 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + README: configs/yolov5/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + - Name: YOLOv5_VOC + Metadata: + Training Data: VOC + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + Training Resources: 1x A100 GPU + Architecture: + - CSPDarkNet + - PAFPN + README: configs/yolov5/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + +Models: + - Name: yolov5_n-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 1.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 28.0 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth + - Name: yolov5_s-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth + - Name: yolov5_m-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth + - Name: yolov5_l-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 8.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth + - Name: yolov5_x-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth + - Name: 
yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth + - Name: yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 10.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth + - Name: yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 19.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth + - Name: yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 30.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 53.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth + - Name: yolov5_n-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory 
(GB): 3.5 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth + - Name: yolov5_s-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 62.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth + - Name: yolov5_m-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory (GB): 12.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 70.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth + - Name: yolov5_l-v61_fast_1xb32-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py + Metadata: + Training Memory (GB): 10.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 73.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth + - Name: yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 1.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 28.0 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth + - Name: yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth + - Name: yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth + - Name: yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 8.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth + - Name: yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth + - Name: yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 23.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth + - Name: yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth + - Name: yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.1 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth + - Name: yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth + - Name: yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth + - Name: yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 15.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth diff 
--git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..4b470973c46073748803bac2f736eca615e3cb00 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py @@ -0,0 +1,25 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 1.0 +widen_factor = 1.0 +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) + +optim_wrapper = dict( + optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed2127a19854fde1b6fa0c80f4d6fd2ba818f0a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,17 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 0.67 +widen_factor = 0.75 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + 
neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..041f6537d03a4f13402b1bb7e2665443793e4681 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,17 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..f777fff9697dfbd315a0b8f762a2bf31a1118ca8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,270 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# dataset settings +data_root = 'data/VOCdevkit/' +dataset_type = 'YOLOv5VOCDataset' + +# parameters that often need to be modified +num_classes = 20 +img_scale = (512, 512) # width, height +max_epochs = 50 +train_batch_size_per_gpu = 64 +train_num_workers = 8 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +# persistent_workers must be False if num_workers is 0. 
+persistent_workers = True + +lr_factor = 0.15135 +affine_scale = 0.75544 + +# only on Val +batch_shapes_cfg = dict(img_size=img_scale[0]) + +anchors = [[(26, 44), (67, 57), (61, 130)], [(121, 118), (120, 239), + (206, 182)], + [(376, 161), (234, 324), (428, 322)]] +num_det_layers = 3 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +tta_img_scales = [img_scale, (416, 416), (640, 640)] + +# Hyperparameter reference from: +# https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.VOC.yaml +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict( + loss_weight=0.21638 * (num_classes / 80 * 3 / num_det_layers), + class_weight=0.5), + loss_bbox=dict(loss_weight=0.02 * (3 / num_det_layers)), + loss_obj=dict( + loss_weight=0.51728 * + ((img_scale[0] / 640)**2 * 3 / num_det_layers), + class_weight=0.67198), + # Different from COCO + prior_match_thr=3.3744), + test_cfg=dict(nms=dict(iou_threshold=0.6))) + +albu_train_transforms = _base_.albu_train_transforms +pre_transform = _base_.pre_transform + +with_mosiac_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='YOLOv5MixUp', + prob=0.04266, + pre_transform=[ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + 
affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) + ]) +] + +without_mosaic_pipeline = [ + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(0, 0), + border_val=(114, 114, 114)), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)) +] + +# Because the border parameter is inconsistent when +# using mosaic or not, `RandomChoice` is used here. +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[with_mosiac_pipeline, without_mosaic_pipeline], + prob=[0.85834, 0.14166]) + +train_pipeline = [ + *pre_transform, randchoice_mosaic_pipeline, + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict( + type='YOLOv5HSVRandomAug', + hue_delta=0.01041, + saturation_delta=0.54703, + value_delta=0.27739), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + _delete_=True, + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + 
data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + ], + # Use ignore_keys to avoid judging metainfo is + # not equal in `ConcatDataset`. + ignore_keys='dataset_type'), + collate_fn=dict(type='yolov5_collate')) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/test.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + test_mode=True, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + optimizer=dict( + lr=0.00334, + momentum=0.74832, + weight_decay=0.00025, + batch_size_per_gpu=train_batch_size_per_gpu)) + +default_hooks = dict( + param_scheduler=dict( + lr_factor=lr_factor, + max_epochs=max_epochs, + warmup_epochs=3.3835, + warmup_momentum=0.59462, + warmup_bias_lr=0.18657)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + # To load COCO pretrained model, need to set `strict_load=False` + strict_load=False, + priority=49) +] + +# TODO: Support using coco metric in voc dataset +val_evaluator = dict( + _delete_=True, type='mmdet.VOCMetric', metric='mAP', eval_mode='area') + +test_evaluator = val_evaluator + +train_cfg = 
dict(max_epochs=max_epochs) + +# Config for Test Time Augmentation. (TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..2fc4d79f86b40c45d3f7692f32adc88295bbb4a4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py @@ -0,0 +1,26 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 1.33 +widen_factor = 1.25 +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +# TODO: need to add pretrained_model +load_from = None + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) + +optim_wrapper = dict( + optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py 
b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6a84fdbebc11dd4eafadc34be1e98bfb6f9b2f43 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..60a11a375c3dd8ead1d3f6a04340aed2acb20b20 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f593e378a9fbbf1381e48a186a645a559b1f129a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = 
'./yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] 
+ +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ef324ed097a30d5a04fba2bb85641e7857f353 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, 
*mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd2d6b7be817f4f8e6729acc1d3f9e450457e07 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = 'yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f93428fc8d6dc1b94a8d447671ffc1a877dbb8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = 
'./yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f64df69fd4ea0f4c8d30b9e8928bcd1c4e1d9d35 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,138 @@ +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +img_scale = (1280, 1280) # width, height +num_classes = 80 # Number of classes for classification +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=64) +# Basic size of multi-scale prior box +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 + [(436, 615), (739, 380), (925, 792)] # P6/64 +] +# Strides of multi-scale prior box +strides = [8, 16, 32, 64] +num_det_layers = 4 # The number of model output scales +loss_cls_weight = 0.5 +loss_bbox_weight = 0.05 +loss_obj_weight = 1.0 +# The obj loss weights of the four output layers +obj_level_weights = [4.0, 1.0, 0.25, 0.06] +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio + +tta_img_scales = [(1280, 1280), (1024, 1024), (1536, 1536)] +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(arch='P6', out_indices=(2, 3, 4, 5)), + neck=dict( + in_channels=[256, 512, 768, 1024], out_channels=[256, 512, 768, 1024]), + bbox_head=dict( + head_module=dict( + in_channels=[256, 512, 768, 1024], featmap_strides=strides), + prior_generator=dict(base_sizes=anchors, strides=strides), + # scaled based on number of detection layers + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict(loss_weight=loss_bbox_weight * (3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=obj_level_weights)) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 
114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Config for Test Time Augmentation. (TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..5bbd13e0859abb7a9fa315a8b0f956f959a560d7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py @@ -0,0 +1,70 @@ +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +# This configuration is used to provide non-square training examples +# Must be a multiple of 32 +img_scale = (608, 352) # w h + +anchors = [ + [(65, 35), (159, 45), (119, 80)], # P3/8 + [(215, 77), (224, 116), (170, 166)], # P4/16 + [(376, 108), (339, 176), (483, 190)] # P5/32 +] + +# ===============================Unmodified in most cases==================== +_base_.model.bbox_head.loss_obj.loss_weight = 1.0 * ((img_scale[1] / 640)**2) +_base_.model.bbox_head.prior_generator.base_sizes = anchors + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + 
type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +_base_.train_dataloader.dataset.pipeline = train_pipeline + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='mmdet.LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +test_dataloader = val_dataloader diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..7b7e4f227bbc6aa37873dc306009d1af842c166c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + 
batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..dc460fa9802d34ece214482bcda7a6bdf7435b39 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py @@ -0,0 +1,13 @@ +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1) + ])) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py new 
file mode 100644 index 0000000000000000000000000000000000000000..d8238c1377cb2f56f4c3bf0c5cd6d4227b2d70a5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + use_mini_pad=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +test_dataloader = val_dataloader + +model = dict( + test_cfg=dict( + multi_label=False, score_thr=0.25, nms=dict(iou_threshold=0.45))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7e81a0385587df40c588dcb44202a7f5d82478c1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py @@ -0,0 +1,292 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to 
pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(10, 13), (16, 30), (33, 23)], # P3/8 + [(30, 61), (62, 45), (59, 119)], # P4/16 + [(116, 90), (156, 198), (373, 326)] # P5/32 +] + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +loss_cls_weight = 0.5 +loss_bbox_weight = 0.05 +loss_obj_weight = 1.0 +prior_match_thr = 4. # Prior box matching threshold +# The obj loss weights of the three output layers +obj_level_weights = [4., 1., 0.4] +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv5CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + widen_factor=widen_factor, + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + prior_match_thr=prior_match_thr, + obj_level_weights=obj_level_weights), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + 
+train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + 
data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py new file mode 100644 index 0000000000000000000000000000000000000000..2c585ceb92e9bfb1984b49ce02f86f4d3cd4532d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py @@ -0,0 +1,42 @@ +_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +data_root = 'data/balloon/' +# Path of train annotation file +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Prefix 
of train image path +# Path of val annotation file +val_ann_file = 'val.json' +val_data_prefix = 'val/' # Prefix of val image path +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 + +train_batch_size_per_gpu = 4 +train_num_workers = 2 +log_interval = 1 + +# =======================Unmodified in most cases================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +default_hooks = dict(logger=dict(interval=log_interval)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..17b4a73b092fda1b98a088a83619697702859f71 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# fast means faster training speed, +# but less flexibility for multitasking +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True)) + +train_dataloader = dict(collate_fn=dict(type='yolov5_collate')) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py 
b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe5c0103520280ba26bb3f56a4a30658576b74b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,14 @@ +_base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8782eed8df6318b3aad6333809a04f639fd0cefb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,14 @@ +_base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..60c11feb3d4e6f8db5f3e70af5d3afdbc5f65535 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,59 @@ +_base_ = 
'./yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 + +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform +affine_scale = _base_.affine_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..22b9e881d024bfc781b1328913b50439ac80a2f3 --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc86fdd2d9ae362477f4edc5e5f9dd497222946 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +affine_scale = 0.9 +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + 
pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0cfb332488ba41c5e0880bd91d8c73fccde52f36 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 0.67 +widen_factor = 
0.75 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca21b65147e830b04b0e70e61011f6a9371d637 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ad6a9f2eba7ac8fc56c12fab52a3a8f9b24acba1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified 
parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d6840bc288b2cb9d26ebc06d0b888926035ce8b9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,80 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 
'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..81d3a981c281af0f4cd9596c4a7349cb2e1bf367 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,326 @@ +_base_ = ['../../_base_/default_runtime.py', '../../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 
'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs +# Disable mosaic augmentation for final 10 epochs (stage 2) +close_mosaic_epochs = 10 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.7), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +tal_topk = 10 # Number of bbox selected in each level +tal_alpha = 0.5 # A Hyper-parameter related to alignment_metrics +tal_beta = 6.0 # A Hyper-parameter related to alignment_metrics + +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +# YOLOv5RandomAffine aspect ratio of width and height thres to filter bboxes +max_aspect_ratio = 100 +# TODO: Automatically scale loss_weight based on number of detection layers +loss_cls_weight = 0.5 +loss_bbox_weight = 7.5 +# Since the dfloss is implemented differently in the official +# and mmdet, we're going to divide loss_weight by 4. +loss_dfl_weight = 1.5 / 4 +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.001 +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv5CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv8Head', + head_module=dict( + type='YOLOv8HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + widen_factor=widen_factor, + reg_max=16, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=loss_bbox_weight, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=loss_dfl_weight)), + train_cfg=dict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + use_ciou=True, + topk=tal_topk, + alpha=tal_alpha, + beta=tal_beta, + eps=1e-9)), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + 
dict(type='LoadAnnotations', with_bbox=True) +] + +last_transform = [ + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + 
dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs, + warmup_epochs=3.0, + warmup_momentum=0.8, + warmup_bias_lr=0.1), + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + 
val_interval=save_checkpoint_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..33092aa6a47e6053c8ce83dcdf820828619077bc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fd471fd46f3e19c4e0a4176703d4ab5eeee3aa0b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_l_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + 
widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/README.md b/models/YOLO-World/third_party/mmyolo/configs/yolov6/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ecda276988ff87702e902be8799d85b2dfdc79f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/README.md @@ -0,0 +1,53 @@ +# YOLOv6 + +> [YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications](https://arxiv.org/abs/2209.02976) + + + +## Abstract + +For years, YOLO series have been de facto industry-level standard for efficient object detection. The YOLO community has prospered overwhelmingly to enrich its use in a multitude of hardware platforms and abundant scenarios. In this technical report, we strive to push its limits to the next level, stepping forward with an unwavering mindset for industry application. Considering the diverse requirements for speed and accuracy in the real environment, we extensively examine the up-to-date object detection advancements either from industry or academy. Specifically, we heavily assimilate ideas from recent network design, training strategies, testing techniques, quantization and optimization methods. On top of this, we integrate our thoughts and practice to build a suite of deployment-ready networks at various scales to accommodate diversified use cases. With the generous permission of YOLO authors, we name it YOLOv6. We also express our warm welcome to users and contributors for further enhancement. For a glimpse of performance, our YOLOv6-N hits 35.9% AP on COCO dataset at a throughput of 1234 FPS on an NVIDIA Tesla T4 GPU. YOLOv6-S strikes 43.5% AP at 495 FPS, outperforming other mainstream detectors at the same scale (YOLOv5-S, YOLOX-S and PPYOLOE-S). Our quantized version of YOLOv6-S even brings a new state-of-the-art 43.3% AP at 869 FPS. 
Furthermore, YOLOv6-M/L also achieve better accuracy performance (i.e., 49.5%/52.3%) than other detectors with similar inference speed. We carefully conducted experiments to validate the effectiveness of each component. + +
+ +
+ +
+YOLOv6-s +YOLOv6-s model structure +
+ +
+YOLOv6-l +YOLOv6-l model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | Size | Epoch | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :------: | :--: | :--: | :---: | :----: | :-: | :------: | :----: | :-------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv6-n | P5 | 640 | 400 | Yes | Yes | 6.04 | 36.2 | [config](./yolov6_n_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726.log.json) | +| YOLOv6-t | P5 | 640 | 400 | Yes | Yes | 8.13 | 41.0 | [config](./yolov6_t_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755.log.json) | +| YOLOv6-s | P5 | 640 | 400 | Yes | Yes | 8.88 | 44.0 | [config](./yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| YOLOv6-m | P5 | 640 | 300 | Yes | Yes | 16.69 | 48.4 | [config](./yolov6_m_syncbn_fast_8xb32-300e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658.log.json) | +| YOLOv6-l | P5 | 640 | 300 | Yes | Yes | 20.86 | 51.0 | [config](./yolov6_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156.log.json) | + +**Note**: + +1. The official m and l models use knowledge distillation, which our version does not support yet; it will be implemented in [MMRazor](https://github.com/open-mmlab/mmrazor) in the future. +2. The performance is unstable and may fluctuate by about 0.3 mAP. +3. If users need 300-epoch weights for the nano, tiny and small models, they can train with the 300-epoch configs we provide, or convert the official weights with the [converter script](../../tools/model_converters/). +4. We have observed that the [base model](https://github.com/meituan/YOLOv6/tree/main/configs/base) was recently released in the official YOLOv6 repository. Although the accuracy has decreased, it is more efficient. We will also provide the base model configuration in the future. 
+ +## Citation + +```latex +@article{li2022yolov6, + title={YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications}, + author={Li, Chuyi and Li, Lulu and Jiang, Hongliang and Weng, Kaiheng and Geng, Yifei and Li, Liang and Ke, Zaidan and Li, Qingyuan and Cheng, Meng and Nie, Weiqiang and others}, + journal={arXiv preprint arXiv:2209.02976}, + year={2022} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/yolov6/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..df451526957c08d5956db33fe5e180cd7d5fcd66 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: YOLOv6 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + - RepVGG + Paper: + URL: https://arxiv.org/abs/2209.02976 + Title: 'YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications' + README: configs/yolov6/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov6_s_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 8.88 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth + - Name: yolov6_n_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 6.04 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 
36.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth + - Name: yolov6_t_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 8.13 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth + - Name: yolov6_m_syncbn_fast_8xb32-300e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 16.69 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth + - Name: yolov6_l_syncbn_fast_8xb32-300e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 20.86 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ad5ecf347e4aa0b3194b8be33d9c294915dd9e56 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,28 @@ +_base_ = './yolov6_m_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible 
modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1 +# The scaling factor that controls the width of the network structure +widen_factor = 1 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..09811c8c06fb81a061ac4da7904c8d7d1e248411 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. 
/ 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. / 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2db4b6c03277a7c62ba3ed505d54f54267328f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width 
of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f66aa15fc447bce5f510a60bdda1914a8a7b5a76 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py new file 
mode 100644 index 0000000000000000000000000000000000000000..82578fccf7fffb8e4bb4ac21170543a7f71bc63e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 +num_last_epochs = 5 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu +_base_.custom_hooks[1].switch_epoch = max_epochs - num_last_epochs + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. 
+ param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..dbffaeb3362883d8a70f43c0722dd6c99b8b8352 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,33 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Frequently modified parameters ===================== +# -----train val related----- +# Training schedule overrides for the 300-epoch variant +max_epochs = 300 # Maximum training epochs +num_last_epochs = 15 # Number of final epochs run with the stage-2 pipeline + +# ============================== Unmodified in most cases =================== +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=0.01, + max_epochs=max_epochs)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] + +train_cfg = dict( + max_epochs=max_epochs, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..eb564c07a906185f6702aac88cbb4d53493f168c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,280 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ======================= Frequently modified parameters ===================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper +base_lr = 0.01 +max_epochs = 400 # Maximum training epochs +num_last_epochs = 15 # Last epoch number to switch training pipeline + +# ======================= Possible modified parameters ======================= +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ============================== Unmodified in most cases =================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv6EfficientRep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6RepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[128, 256, 512], + num_csp_blocks=12, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True), + ), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict( + type='YOLOv6HeadModule', + num_classes=num_classes, + in_channels=[128, 256, 512], + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + 
initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6), + ), + test_cfg=dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +# The training pipeline of YOLOv6 is basically the same as YOLOv5. +# The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. # noqa +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + max_shear_degree=0.0), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_shear_degree=0.0, + ), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + 
batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + collate_fn=dict(type='yolov5_collate'), + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Optimizer and learning rate scheduler of YOLOv6 are basically the same as YOLOv5. # noqa +# The difference is that the scheduler_type of YOLOv6 is cosine. 
+optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..aa9da63f6984a9a23bc7ca78780db5be5a782399 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure 
+widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..75755555a58b45309df9213b6262cee030e41a9d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed4b05538c077d6f49036c6399942d5f8b3f627 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,28 @@ +_base_ = 
'./yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1 +# The scaling factor that controls the width of the network structure +widen_factor = 1 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..982b0c8865a557c9970c1f50e3b84acba89bf93f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + 
deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. / 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(reg_max=16, widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..96469f026e253b76a293f8f3ef81148af5d258a8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls 
the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0ad190139fa199918752cb8b531352db942fc0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,282 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ======================= Frequently modified parameters ===================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper +base_lr = 0.01 
+max_epochs = 300 # Maximum training epochs +num_last_epochs = 15 # Number of final epochs that use train_pipeline_stage2 + +# ======================= Possible modified parameters ======================= +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only used on val. +# Batch shapes are not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# cudnn_benchmark is recommended to be turned on for single-scale +# training, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ============================== Unmodified in most cases =================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv6EfficientRep', + out_indices=[1, 2, 3, 4], + use_cspsppf=True, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6RepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[128, 256, 512, 1024], + out_channels=[128, 256, 512], + num_csp_blocks=12, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True), + ), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict( + type='YOLOv6HeadModule', + num_classes=num_classes, + in_channels=[128, 256, 512], + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6), + ), + test_cfg=dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +# The training pipeline of YOLOv6 is basically the same as YOLOv5. +# The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. 
# noqa +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + max_shear_degree=0.0), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_shear_degree=0.0, + ), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + collate_fn=dict(type='yolov5_collate'), + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + 
type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Optimizer and learning rate scheduler of YOLOv6 are basically the same as YOLOv5. # noqa +# The difference is that the scheduler_type of YOLOv6 is cosine. +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + 
val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d088b6b6629345f6f086f67373206b6d6f9b7e31 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/README.md b/models/YOLO-World/third_party/mmyolo/configs/yolov7/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f8f87f8358e25b7c8004aabfe7229d7941b6919a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/README.md @@ -0,0 +1,50 @@ +# YOLOv7 + +> [YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors](https://arxiv.org/abs/2207.02696) + + + +## Abstract + +YOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8% AP among all known real-time 
object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9% AP) outperforms both transformer-based detector SWIN-L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by 509% in speed and 2% in accuracy, and convolutional-based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed and 0.7% AP in accuracy, as well as YOLOv7 outperforms: YOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable DETR, DINO-5scale-R50, ViT-Adapter-B and many other object detectors in speed and accuracy. Moreover, we train YOLOv7 only on MS COCO dataset from scratch without using any other datasets or pre-trained weights. Source code is released in [this https URL](https://github.com/WongKinYiu/yolov7). + +
+ +
+ +
+YOLOv7-l +YOLOv7-l-P5 model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | Size | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :----: | :-: | :------: | :----: | :----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv7-tiny | P5 | 640 | Yes | Yes | 2.7 | 37.5 | [config](./yolov7_tiny_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719.log.json) | +| YOLOv7-l | P5 | 640 | Yes | Yes | 10.3 | 50.9 | [config](./yolov7_l_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601.log.json) | +| YOLOv7-x | P5 | 640 | Yes | Yes | 13.7 | 52.8 | [config](./yolov7_x_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331.log.json) | +| YOLOv7-w | P6 | 1280 | Yes | Yes | 27.0 | 54.1 | [config](./yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031.log.json) | +| YOLOv7-e | P6 | 1280 | Yes | Yes | 42.5 | 55.1 | [config](./yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636.log.json) | + +**Note**: +In the official YOLOv7 code, the `random_perspective` data augmentation in COCO object detection task training uses mask annotation information, which leads to higher performance. Object detection should not use mask annotation, so only box annotation information is used in `MMYOLO`. We will use the mask annotation information in the instance segmentation task. + +1. The performance is unstable and may fluctuate by about 0.3 mAP. The performance shown above is the best model. +2. If users need the weight of `YOLOv7-e2e`, they can train according to the configs provided by us, or convert the official weight according to the [converter script](https://github.com/open-mmlab/mmyolo/blob/main/tools/model_converters/yolov7_to_mmyolo.py). +3. `fast` means that `YOLOv5DetDataPreprocessor` and `yolov5_collate` are used for data preprocessing, which is faster for training, but less flexible for multitasking. We recommend using the fast version config if you only care about object detection. +4. `SyncBN` means that SyncBN is used; `AMP` indicates training with mixed precision. +5. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code. 
+ +## Citation + +```latex +@article{wang2022yolov7, + title={{YOLOv7}: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors}, + author={Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark}, + journal={arXiv preprint arXiv:2207.02696}, + year={2022} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/yolov7/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..067ec6b45afefa2ae444b0343ad327b94f1507d2 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: YOLOv7 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - EELAN + - PAFPN + - RepVGG + Paper: + URL: https://arxiv.org/abs/2207.02696 + Title: 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors' + README: configs/yolov7/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov7_tiny_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth + - Name: yolov7_l_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth + - Name: yolov7_x_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 13.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth + - Name: yolov7_w-p6_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 27.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth + - Name: yolov7_e-p6_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 42.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 55.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a68715264d59c16ef2b31010ede44310d97a3a7e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + 
backbone=dict(arch='D'), + neck=dict( + use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[384, 768, 1152, 1536], + out_channels=[192, 384, 576, 768]), + bbox_head=dict( + head_module=dict( + in_channels=[192, 384, 576, 768], + main_out_channels=[384, 768, 1152, 1536], + aux_out_channels=[384, 768, 1152, 1536], + ))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1463dc487e05eabfd3f586a28262017a9dc566 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,19 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='E'), + neck=dict( + use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[320, 640, 960, 1280], + out_channels=[160, 320, 480, 640]), + bbox_head=dict( + head_module=dict( + in_channels=[160, 320, 480, 640], + main_out_channels=[320, 640, 960, 1280]))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6af81051b72977410d5b51cf7a02a476d55ceb24 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='E2E'), + neck=dict( + 
use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='EELANBlock', + num_elan_block=2, + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[320, 640, 960, 1280], + out_channels=[160, 320, 480, 640]), + bbox_head=dict( + head_module=dict( + in_channels=[160, 320, 480, 640], + main_out_channels=[320, 640, 960, 1280]))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a756c27e5366e3a83658132b0e330a5f68ad22 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,324 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(12, 16), (19, 36), (40, 28)], # P3/8 + [(36, 75), (76, 55), (72, 146)], # P4/16 + [(142, 110), (192, 243), (459, 401)] # P5/32 +] +# -----train val related----- +# Base learning rate for optim_wrapper. 
Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +num_epoch_stage2 = 30 # Number of final epochs that use val_interval_stage2 +val_interval_stage2 = 1 # Evaluation interval for those final epochs + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS. + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only used on val. +# Batch shapes are not used if batch_shapes_cfg is None. 
+batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The padded image size should be divisible by size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +strides = [8, 16, 32] # Strides of multi-scale prior box +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) + +# Data augmentation +max_translate_ratio = 0.2 # YOLOv5RandomAffine +scaling_ratio_range = (0.1, 2.0) # YOLOv5RandomAffine +mixup_prob = 0.15 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.3 +loss_bbox_weight = 0.05 +loss_obj_weight = 0.7 +# BatchYOLOv7Assigner params +simota_candidate_topk = 10 +simota_iou_weight = 3.0 +simota_cls_weight = 1.0 +prior_match_thr = 4. # Prior box matching threshold +obj_level_weights = [4., 1., + 0.4] # The obj loss weights of the three output layers + +lr_factor = 0.1 # Learning rate scaling factor +weight_decay = 0.0005 +save_epoch_intervals = 1 # Save model checkpoint and validation intervals +max_keep_ckpts = 3 # The maximum checkpoints to keep. + +# cudnn_benchmark is recommended to be turned on for single-scale +# training, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + prior_match_thr=prior_match_thr, + obj_level_weights=obj_level_weights, + # BatchYOLOv7Assigner params + simota_candidate_topk=simota_candidate_topk, + simota_iou_weight=simota_iou_weight, + simota_cls_weight=simota_cls_weight), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + 
dict(type='LoadAnnotations', with_bbox=True) +] + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, # note + beta=mixup_beta, # note + prob=mixup_prob, + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), # FASTER + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + 
dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv7OptimWrapperConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, # note + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=False, + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), # Can be accelerated + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - 
num_epoch_stage2, val_interval_stage2)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0446760eeb39951ad2bf6a8cbb1fe3cc19870a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +_base_ = 'yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, 
save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e9f10e2926a840d2af7a9e27b0e2047710343d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,98 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ========================modified parameters======================== + +# -----model related----- +# Data augmentation +max_translate_ratio = 0.1 # YOLOv5RandomAffine +scaling_ratio_range = (0.5, 1.6) # YOLOv5RandomAffine +mixup_prob = 0.05 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.5 +loss_obj_weight = 1.0 + +lr_factor = 0.01 # Learning rate scaling factor +# ===============================Unmodified in most cases==================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +model = dict( + backbone=dict( + arch='Tiny', act_cfg=dict(type='LeakyReLU', negative_slope=0.1)), + neck=dict( + is_tiny_version=True, + in_channels=[128, 256, 512], + out_channels=[64, 128, 256], + block_cfg=dict( + _delete_=True, type='TinyDownSampleBlock', middle_ratio=0.25), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + use_repconv_outs=False), + 
bbox_head=dict( + head_module=dict(in_channels=[128, 256, 512]), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, + beta=mixup_beta, + prob=mixup_prob, # change + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py 
new file mode 100644 index 0000000000000000000000000000000000000000..9758b871785050ef41303082aab745a6568e373b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,182 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ========================modified parameters======================== +# -----data related----- +img_scale = (1280, 1280) # width, height +num_classes = 80 # Number of classes for classification +# Config of batch shapes. Only on val +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + img_size=img_scale[ + 0], # The image scale of padding should be divided by pad_size_divisor + size_divisor=64) # Additional paddings for pixel scale +tta_img_scales = [(1280, 1280), (1024, 1024), (1536, 1536)] + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 + [(436, 615), (739, 380), (925, 792)] # P6/64 +] +strides = [8, 16, 32, 64] # Strides of multi-scale prior box +num_det_layers = 4 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) + +# Data augmentation +max_translate_ratio = 0.2 # YOLOv5RandomAffine +scaling_ratio_range = (0.1, 2.0) # YOLOv5RandomAffine +mixup_prob = 0.15 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.3 +loss_bbox_weight = 0.05 +loss_obj_weight = 0.7 +obj_level_weights = [4.0, 1.0, 0.25, 0.06] +simota_candidate_topk = 20 + +# The only difference between P6 and P5 in terms of +# hyperparameters is lr_factor +lr_factor = 0.2 + +# ===============================Unmodified in most cases==================== +pre_transform = _base_.pre_transform + +model = dict( + backbone=dict(arch='W', out_indices=(2, 3, 4, 5)), + 
neck=dict( + in_channels=[256, 512, 768, 1024], + out_channels=[128, 256, 384, 512], + use_maxpool_in_downsample=False, + use_repconv_outs=False), + bbox_head=dict( + head_module=dict( + type='YOLOv7p6HeadModule', + in_channels=[128, 256, 384, 512], + featmap_strides=strides, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + prior_generator=dict(base_sizes=anchors, strides=strides), + simota_candidate_topk=simota_candidate_topk, # note + # scaled based on number of detection layers + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict(loss_weight=loss_bbox_weight * (3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=obj_level_weights)) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, # note + beta=mixup_beta, # note + prob=mixup_prob, + 
pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=batch_shapes_cfg)) +test_dataloader = val_dataloader + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) + +# Config for Test Time Augmentation. (TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9929705962c918392af12dd0a8275321f89fd361 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='X'), + neck=dict( + in_channels=[640, 1280, 1280], + out_channels=[160, 320, 640], + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2), + use_repconv_outs=False), + bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/README.md b/models/YOLO-World/third_party/mmyolo/configs/yolov8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..766aa99163c97bff5206724febd41c3e484faa55 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/README.md @@ -0,0 +1,45 @@ +# YOLOv8 + + + +## Abstract + +Ultralytics YOLOv8, developed by Ultralytics, is a cutting-edge, state-of-the-art (SOTA) model that builds upon the success of previous YOLO versions and introduces new features and improvements to further boost performance and flexibility. YOLOv8 is designed to be fast, accurate, and easy to use, making it an excellent choice for a wide range of object detection, image segmentation and image classification tasks. + +
+ +YOLOv8 performance +
+ +
+ +YOLOv8-P5 model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv8-n | P5 | 640 | No | Yes | Yes | 2.8 | 37.2 | | [config](./yolov8_n_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804.log.json) | +| YOLOv8-n | P5 | 640 | Yes | Yes | Yes | 2.5 | 37.4 (+0.2) | 39.9 | [config](./yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206.log.json) | +| YOLOv8-s | P5 | 640 | No | Yes | Yes | 4.0 | 44.2 | | [config](./yolov8_s_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101.log.json) | +| YOLOv8-s | P5 | 640 | Yes | Yes | Yes | 4.0 | 45.1 (+0.9) | 46.8 | [config](./yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938.log.json) | +| YOLOv8-m | P5 | 640 | No | Yes | Yes | 7.2 | 49.8 | | [config](./yolov8_m_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200.log.json) | +| YOLOv8-m | P5 | 640 | Yes | Yes | Yes | 7.0 | 50.6 (+0.8) | 52.3 | [config](./yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400.log.json) | +| YOLOv8-l | P5 | 640 | No | Yes | Yes | 9.8 | 52.1 | | [config](./yolov8_l_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526.log.json) | +| YOLOv8-l | P5 | 640 | Yes | Yes | Yes | 9.1 | 53.0 (+0.9) | 54.4 | [config](./yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100.log.json) | +| YOLOv8-x | P5 | 640 | No | Yes | Yes | 12.2 | 52.7 | | [config](./yolov8_x_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338.log.json) | +| YOLOv8-x | P5 | 640 | Yes | Yes | Yes | 12.4 | 54.0 (+1.3) | 55.0 | [config](./yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411.log.json) | + +**Note** + +1. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code, but has no effect on performance. +2. The performance is unstable and may fluctuate by about 0.3 mAP and the highest performance weight in `COCO` training in `YOLOv8` may not be the last epoch. The performance shown above is the best model. +3. 
We provide [scripts](https://github.com/open-mmlab/mmyolo/tree/dev/tools/model_converters/yolov8_to_mmyolo.py) to convert official weights to MMYOLO. +4. `SyncBN` means using SyncBN, `AMP` indicates training with mixed precision. +5. The performance of `Mask Refine` training is for the weight performance officially released by YOLOv8. `Mask Refine` means refining bbox by mask while loading annotations and transforming after `YOLOv5RandomAffine`, and the L and X models use `Copy Paste`. +6. `TTA` means Test Time Augmentation. It performs 3 multi-scaling transformations on the image, followed by 2 flipping transformations (flipping and not flipping). You only need to specify `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details. + +## Citation diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/yolov8/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..33cd22bc69114f39c4b2a1fcaeabf5228534bb68 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/metafile.yml @@ -0,0 +1,140 @@ +Collections: + - Name: YOLOv8 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + - Decoupled Head + README: configs/yolov8/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov8_n_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.2 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth + - Name: yolov8_s_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 4.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth + - Name: yolov8_m_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 7.2 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth + - Name: yolov8_l_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth + - Name: yolov8_x_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth + - Name: yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: 
configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth + - Name: yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 4.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth + - Name: yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 7.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.6 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth + - Name: yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 9.1 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 53.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth + - Name: yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: 
+ Training Memory (GB): 12.4 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e25b6bcb63d1bad084f7c2175a6983dadb591fc4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,65 @@ +_base_ = './yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 +last_stage_out_channels = 512 + +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform +affine_scale = _base_.affine_scale + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), 
+ dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bea8b2d56fecd46beddd0370732e8b83309528e5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,39 @@ +_base_ = './yolov8_m_syncbn_fast_8xb16-500e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 +last_stage_out_channels = 512 + +mixup_prob = 0.15 + +# =======================Unmodified in most cases================== +pre_transform = _base_.pre_transform +mosaic_affine_transform = _base_.mosaic_affine_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + 
+train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2884daeb436e321c2c256687e0f063780d680f37 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,85 @@ +_base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +last_stage_out_channels = 768 + +affine_scale = 0.9 +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# ===============================Unmodified in most cases==================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + 
dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..840d32ccff78db31d9945bfe32531c1970845ee7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,76 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +last_stage_out_channels = 768 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale 
+pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=100, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..50d3774267fd89b747574f72b34e6d7d2237c5ef --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..5833df3a157151bca2d2ce29380962e43f1ec876 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,9 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..e54bff03358c4138ea175187f6617735e80f185e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py @@ -0,0 +1,52 @@ +_base_ = 'yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = 
dict(classes=class_name, palette=[(20, 220, 60)]) + +close_mosaic_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu +_base_.custom_hooks[1].switch_epoch = max_epochs - close_mosaic_epochs + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. 
+ param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..769a698e4b52886797e08169cdc6da8eedea204d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,83 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + 
img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7e4127efbfd549803d8794b0bdf9fbcc9565e55c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,334 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val 
annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Number of workers used to pre-fetch data for each GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=64 bs +base_lr = 0.01 +max_epochs = 500 # Maximum training epochs +# Disable mosaic augmentation for final 10 epochs (stage 2) +close_mosaic_epochs = 10 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.7), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Number of workers used to pre-fetch data for each GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# In our tests, YOLOv8-m scores 0.02 higher with it enabled than without. +batch_shapes_cfg = None +# You can turn on `batch_shapes_cfg` by uncommenting the following lines. 
+# batch_shapes_cfg = dict( +# type='BatchShapePolicy', +# batch_size=val_batch_size_per_gpu, +# img_size=img_scale[0], +# # The padded image scale should be divisible by size_divisor +# size_divisor=32, +# # Additional paddings for pixel scale +# extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +# The output channel of the last stage +last_stage_out_channels = 1024 +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +# YOLOv5RandomAffine aspect ratio of width and height thres to filter bboxes +max_aspect_ratio = 100 +tal_topk = 10 # Number of bbox selected in each level +tal_alpha = 0.5 # A Hyper-parameter related to alignment_metrics +tal_beta = 6.0 # A Hyper-parameter related to alignment_metrics +# TODO: Automatically scale loss_weight based on number of detection layers +loss_cls_weight = 0.5 +loss_bbox_weight = 7.5 +# Since the dfloss is implemented differently in the official implementation +# and in mmdet, we divide loss_weight by 4. +loss_dfl_weight = 1.5 / 4 +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals in stage 1 +save_epoch_intervals = 10 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 2 +# Single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv8CSPDarknet', + arch='P5', + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv8PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv8Head', + head_module=dict( + type='YOLOv8HeadModule', + num_classes=num_classes, + in_channels=[256, 512, last_stage_out_channels], + widen_factor=widen_factor, + reg_max=16, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=loss_bbox_weight, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=loss_dfl_weight)), + train_cfg=dict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + use_ciou=True, + topk=tal_topk, + alpha=tal_alpha, + beta=tal_beta, + eps=1e-9)), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + 
dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +last_transform = [ + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + clip_grad=dict(max_norm=10.0), + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + 
+train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + val_interval_stage2)]) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8c27b9619d288f222ea0ce351f9e4578c31934a7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,13 @@ +_base_ = './yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config uses refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +deepen_factor = 1.00 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8e6653278db54745aa3a3a606bc63aa40328b7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,9 @@ +_base_ = './yolov8_l_syncbn_fast_8xb16-500e_coco.py' + +deepen_factor = 1.00 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + 
bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/README.md b/models/YOLO-World/third_party/mmyolo/configs/yolox/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7d5dc683c1b2e912ee27c7492bf7f869c103bb15 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/README.md @@ -0,0 +1,86 @@ +# YOLOX + +> [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430) + + + +## Abstract + +In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3% AP on COCO, surpassing NanoDet by 1.8% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3% AP on COCO, outperforming the current best practice by 3.0% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. + +
+ +
+ +
+ +YOLOX-l model structure +
+ +## 🥳 🚀 Results and Models + +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | Box AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :---------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb8 | No | No | 2.8 | 32.7 | [config](./yolox_tiny_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908.log.json) | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 4.9 | 34.3 (+1.6) | [config](./yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637.log.json) | +| YOLOX-s | 640 | 8xb8 | Yes | No | 2.9 | 40.7 | [config](./yolox_s_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 9.8 | 41.9 (+1.2) | [config](./yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645.log.json) | +| YOLOX-m | 640 | 8xb8 | Yes | No | 4.9 | 46.9 | [config](./yolox_m_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 17.6 | 47.5 (+0.6) | [config](./yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328.log.json) | +| YOLOX-l | 640 | 8xb8 | Yes | No | 8.0 | 50.1 | [config](./yolox_l_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715.log.json) | +| YOLOX-x | 640 | 8xb8 | Yes | No | 9.8 | 51.4 | [config](./yolox_x_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950.log.json) | + +YOLOX uses a default training configuration of `8xbs8` which results in a long training time, 
so we would like to use `8xbs32` to speed up training without decreasing mAP. We modified `train_batch_size_per_gpu` from 8 to 32, `batch_augments_interval` from 10 to 1 and `base_lr` from 0.01 to 0.04 under the YOLOX-s default configuration based on the linear scaling rule, which resulted in mAP degradation. Finally, we found that using RTMDet's training hyperparameters improves performance for YOLOX Tiny/S/M, which also validates the superiority of RTMDet's training hyperparameters. + +The modified training parameters are as follows: + +1. train_batch_size_per_gpu: 8 -> 32 +2. batch_augments_interval: 10 -> 1 +3. num_last_epochs: 15 -> 20 +4. optim cfg: SGD -> AdamW, base_lr 0.01 -> 0.004, weight_decay 0.0005 -> 0.05 +5. ema momentum: 0.0001 -> 0.0002 + +**Note**: + +1. The test score threshold is 0.001. +2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model. Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information. + +## YOLOX-Pose + +Based on [MMPose](https://github.com/open-mmlab/mmpose/blob/main/projects/yolox-pose/README.md), we have implemented a YOLOX-based human pose estimator, utilizing the approach outlined in **YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss (CVPRW 2022)**. This pose estimator is lightweight and quick, making it well-suited for crowded scenes. + +
+ +
+ +### Results + +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :--: | :------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 5.3 | 52.8 | [config](./pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 10.7 | 63.7 | [config](./pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 19.2 | 69.3 | [config](./pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024.log.json) | +| YOLOX-l | 640 | 8xb32 
| Yes | Yes | 30.3 | 71.1 | [config](./pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140.log.json) | + +**Note**: + +1. The performance is unstable and may fluctuate, and the best-performing weights in `COCO` training may not come from the last epoch. The performance shown above is that of the best model. + +### Installation + +Install MMPose + +``` +mim install -r requirements/mmpose.txt +``` + +## Citation + +```latex +@article{yolox2021, + title={{YOLOX}: Exceeding YOLO Series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/metafile.yml b/models/YOLO-World/third_party/mmyolo/configs/yolox/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..78ede704a629fa44957bc2b24e05e6559fc17710 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/metafile.yml @@ -0,0 +1,166 @@ +Collections: + - Name: YOLOX + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Cosine Annealing Lr Updater + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'YOLOX: Exceeding YOLO Series in 2021' + README: configs/yolox/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + + +Models: + - Name: yolox_tiny_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 300 + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 32.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth + - Name: yolox_s_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 2.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth + - Name: yolox_m_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_m_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth + - Name: yolox_l_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_l_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 8.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth + - Name: yolox_x_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_x_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth + - Name: yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 300 + Results: 
+ - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 34.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth + - Name: yolox_s_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth + - Name: yolox_m_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 17.6 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth + - Name: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 5.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth + - Name: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 63.7 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth + - Name: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 19.2 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 69.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth + - Name: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 30.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 71.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..96de5e98183b33d6c19865547e7f7e217be31ea5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + 
neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f78d6a3a2f8ce2828839073f1fe2582f49bb5a69 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa2172c989ddfa6c6b28e33654e1c14b8cbbc91 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,136 @@ +_base_ = '../yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth' # noqa + +num_keypoints = 17 +scaling_ratio_range = 
(0.75, 1.0) +mixup_ratio_range = (0.8, 1.6) +num_last_epochs = 20 + +# model settings +model = dict( + bbox_head=dict( + type='YOLOXPoseHead', + head_module=dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=num_keypoints, + ), + loss_pose=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)), + train_cfg=dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', metainfo='configs/_base_/pose/coco.py'))), + test_cfg=dict(score_thr=0.01)) + +# pipelines +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_keypoints=True) +] + +img_scale = _base_.img_scale + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=mixup_ratio_range, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + 
pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +# dataset settings +dataset_type = 'PoseCocoDataset' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_train2017.json', + pipeline=train_pipeline_stage1)) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_val2017.json', + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + _delete_=True, + type='mmpose.CocoMetric', + ann_file=_base_.data_root + 'annotations/person_keypoints_val2017.json', + score_mode='bbox') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +visualizer = dict(type='mmpose.PoseLocalVisualizer') + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a7399065e70f40f4142abc943b572cbd93954222 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,70 @@ +_base_ = './yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth' # noqa + +deepen_factor = 0.33 +widen_factor = 0.375 +scaling_ratio_range = (0.75, 1.0) + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1) + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# data settings +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='FilterAnnotations', + by_keypoints=True, + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py new file mode 100644 
index 0000000000000000000000000000000000000000..39198d2e245b00445f0a5d38e41a1ffe389b17de --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4a4743c2dd4bcbe9e692aff54e3af1909d540c60 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ec8fd2c854bc2d41d53ba481fa3ad7f23ba3c54a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py @@ -0,0 
+1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..851664fb3cb03dc24c4ea03e158b08db011684e9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 +use_depthwise = True + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, use_depthwise=use_depthwise))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a0a5d373856343af82259f9c165f851be49de16d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = 
'./yolox_tiny_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 +use_depthwise = True + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, use_depthwise=use_depthwise))) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_p5_tta.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_p5_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..7ffe3490ca3f7f059d498201277f4df86fbcd3da --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_p5_tta.py @@ -0,0 +1,56 @@ +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +_backend_args = None + +tta_model = dict( + type='mmdet.DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +# LoadImageFromFile +# / | \ +# Resize Resize Resize # noqa +# / \ / \ / \ +# RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa +# | | | | | | +# LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn +# | | | | | | +# PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.Resize', scale=s, keep_ratio=True) + for s in img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box 
coordinates after flipping cannot be + # recovered correctly. + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], + [ + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + ], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..f7eac58fb548a034e22acccef72a32951bb80dee --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py @@ -0,0 +1,76 @@ +_base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_last_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +param_scheduler = [ + dict( + # use quadratic formula to warm up 3 epochs + # and lr is updated by iteration + # TODO: fix 
default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=3, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 35 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last num_last_epochs epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +_base_.custom_hooks[0].num_last_epochs = num_last_epochs + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..167023da94815e13a782b85209e1116aeac7803d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,87 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +# Batch size of a single GPU during training +# 8 -> 32 +train_batch_size_per_gpu = 32 + +# Multi-scale training intervals +# 10 -> 1 +batch_augments_interval = 1 + +# Last epoch number to switch training pipeline +# 15 -> 20 +num_last_epochs = 20 + +# Base learning rate for optim_wrapper. 
Corresponding to 8xb32=256 bs +base_lr = 0.004 + +# SGD -> AdamW +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# 0.0001 -> 0.0002 +ema_momentum = 0.0002 + +# ============================== Unmodified in most cases =================== +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=batch_augments_interval) + ])) + +param_scheduler = [ + dict( + # use quadratic formula to warm up 5 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 285 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=5, + T_max=_base_.max_epochs - num_last_epochs, + end=_base_.max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last num_last_epochs epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=_base_.max_epochs - num_last_epochs, + end=_base_.max_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=_base_.train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=ema_momentum, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) +train_cfg = dict(dynamic_intervals=[(_base_.max_epochs - num_last_epochs, 1)]) +auto_scale_lr = dict(base_batch_size=8 * train_batch_size_per_gpu) diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py 
b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b371ea11d2dd0900476d88a9de626e881297d790 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py @@ -0,0 +1,331 @@ +_base_ = ['../_base_/default_runtime.py', 'yolox_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. 
Corresponding to 8xb8=64 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +model_test_cfg = dict( + yolox_style=True, # better + # The config of multi-label for multi-class prediction + multi_label=True, # 40.5 -> 40.7 + score_thr=0.001, # Threshold to filter out boxes + max_per_img=300, # Max number of detections of each image + nms=dict(type='nms', iou_threshold=0.65)) # NMS type and threshold + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) +# generate new random resize shape interval +batch_augments_interval = 10 + +# -----train val related----- +weight_decay = 0.0005 +loss_cls_weight = 1.0 +loss_bbox_weight = 5.0 +loss_obj_weight = 1.0 +loss_bbox_aux_weight = 1.0 +center_radius = 2.5 # SimOTAAssigner +num_last_epochs = 15 +random_affine_scaling_ratio_range = (0.1, 2) +mixup_ratio_range = (0.8, 1.6) +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. 
+max_keep_ckpts = 3 + +ema_momentum = 0.0001 + +# ===============================Unmodified in most cases==================== +# model settings +model = dict( + type='YOLODetector', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, # math.sqrt(5) + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + # TODO: Waiting for mmengine support + use_syncbn=False, + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=batch_augments_interval) + ]), + backbone=dict( + type='YOLOXCSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + ), + neck=dict( + type='YOLOXPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOXHead', + head_module=dict( + type='YOLOXHeadModule', + num_classes=num_classes, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + featmap_strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + ), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=loss_bbox_weight), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=loss_obj_weight), + loss_bbox_aux=dict( + type='mmdet.L1Loss', + reduction='sum', + loss_weight=loss_bbox_aux_weight)), + train_cfg=dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + center_radius=center_radius, + 
iou_calculator=dict(type='mmdet.BboxOverlaps2D'))), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=random_affine_scaling_ratio_range, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=mixup_ratio_range, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + # If the image is three-channel, the pad value needs + # to be set separately for each channel. 
+ pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline_stage1)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# Reduce evaluation time +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') + +test_evaluator = val_evaluator + +# optimizer +# default 8 gpu +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.9, + weight_decay=weight_decay, + nesterov=True), + 
paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# learning rate +param_scheduler = [ + dict( + # use quadratic formula to warm up 5 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 285 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 15 epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=ema_momentum, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) + +auto_scale_lr = dict(base_batch_size=8 * train_batch_size_per_gpu) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..28e539c9472d20fe2e28b49659ec523c098bb170 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,70 @@ +_base_ = 
'./yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.375 + +# Multi-scale training intervals +# 10 -> 1 +batch_augments_interval = 1 + +scaling_ratio_range = (0.5, 1.5) + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=batch_augments_interval) + ]), + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=(416, 416), keep_ratio=True), # note + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = 
dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fd175a6c73ccc55df697ccbf04dfb46a3fbdc0ee --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py @@ -0,0 +1,100 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.375 +scaling_ratio_range = (0.5, 1.5) + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +test_img_scale = (416, 416) +tta_img_scales = [test_img_scale, (320, 320), (640, 640)] + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=10) + ]), + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', 
backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=test_img_scale, keep_ratio=True), # note + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# Config for Test Time Augmentation. (TTA) +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.Resize', scale=s, keep_ratio=True) + for s in tta_img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], + [ + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + ], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0759d468be70f9af026fef2ae0dbf2308082ad96 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb b/models/YOLO-World/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a09a1a10512c15abd611c35cefdfbeda64090268 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb @@ -0,0 +1,658 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO instance segmentation\n", + "\n", + "Instance segmentation is a task in computer vision that aims to segment each object in an image and assign each object a unique identifier.\n", + "\n", + "Unlike semantic segmentation, instance segmentation not only segments out different categories in an image, but also separates different instances of the same category.\n", + "\n", + "
\n", + "\"Instance\n", + "
\n", + "\n", + "Taking the downloadable balloon dataset as an example, I will guide you through a 15-minute easy introduction to MMYOLO instance segmentation. The entire process includes the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we will use YOLOv5-s as an example. For the demo configuration of the balloon dataset with other YOLO series algorithms, please refer to the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch 
version\n", + "import torch\n", + "print(torch.__version__)\n", + "print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Balloon dataset is a single-class dataset that consists of 74 images and includes annotated information required for training. Here is an example image from the dataset:\n", + "\n", + "
\n", + "\"balloon\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete\n", + "!python ./tools/dataset_converters/balloon2coco.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "The data for the MMYOLO project is located in the MMYOLO project directory. The `train.json` and `val.json` files store the annotations in COCO format, while the `data/balloon/train` and `data/balloon/val` directories contain all the images for the dataset.\n", + "\n", + "## Config\n", + "\n", + "Taking the YOLOv5 algorithm as an example, and considering the limited GPU memory of users, we need to modify some default training parameters to make training run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively.\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 4.\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary.\n", + "\n", + "To perform the specific operation, create a new configuration file named `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` in the `configs/yolov5/ins_seg` folder. For convenience, we have already provided this configuration file. 
Copy the following contents into the configuration file.\n", + "\n", + "```python\n", + "_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa\n", + "\n", + "data_root = 'data/balloon/' # dataset root\n", + "# Training set annotation file of json path\n", + "train_ann_file = 'train.json'\n", + "train_data_prefix = 'train/' # Dataset prefix\n", + "# Validation set annotation file of json path\n", + "val_ann_file = 'val.json'\n", + "val_data_prefix = 'val/'\n", + "metainfo = {\n", + " 'classes': ('balloon', ), # dataset category name\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "num_classes = 1\n", + "# Set batch size to 4\n", + "train_batch_size_per_gpu = 4\n", + "# dataloader num workers\n", + "train_num_workers = 2\n", + "log_interval = 1\n", + "#####################\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=train_data_prefix),\n", + " ann_file=train_ann_file))\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=val_data_prefix),\n", + " ann_file=val_ann_file))\n", + "test_dataloader = val_dataloader\n", + "val_evaluator = dict(ann_file=data_root + val_ann_file)\n", + "test_evaluator = val_evaluator\n", + "default_hooks = dict(logger=dict(interval=log_interval))\n", + "#####################\n", + "\n", + "model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes)))\n", + "```\n", + "\n", + "The above configuration inherits from `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` and updates configurations such as `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, etc., based on the characteristics of the balloon dataset.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "After running the training command mentioned above, the folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` will be automatically generated. The weight files and the training configuration file for this session will be saved in this folder. On a lower-end GPU like the GTX 1660, the entire training process will take approximately 30 minutes.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `val.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "```\n", + "\n", + "The above metrics are printed by the COCO API, where -1 indicates the absence of objects of that scale.\n", + "\n", + "### Some Notes\n", + "\n", + "One key warning is printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "\n", + "This warning appears because the `num_classes` currently being trained is 1. Following the community implementation of the YOLOv5 algorithm, the loss of the classification branch is always 0 in this case, which is a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3 GB of GPU memory, so if you don't have enough, consider turning on mixed-precision training with `--amp`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other visualization back ends. Local visualization is used by default, and you can switch to WandB or another back end to visualize various training indicators in real time.\n", + "\n", + "#### 1 WandB\n", + "\n", + "To use WandB visualization, you need to register on the WandB website and obtain your API key from https://wandb.ai/settings.\n", + "\n", + "
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of the config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, a Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + 
"metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Running the above test command not only prints the AP performance shown in the **Training** section, but also automatically saves the result images to the `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` folder. Below is one of the result images: the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use `WandbVisBackend` or `TensorboardVisBackend`.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides feature map visualization scripts for analyzing the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md).\n", + "\n", + "Because the default `test_pipeline` introduces a bias when feature maps are visualized directly, we need to change the `test_pipeline` of `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` from\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/balloon/train/3927754171_9011487133_b.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. 
Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth --target-layers backbone --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \\\n", + " configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \\\n", + " work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "TODO" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EasyDeploy deployment\n", + "TODO\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/models/YOLO-World/third_party/mmyolo/demo/15_minutes_object_detection.ipynb b/models/YOLO-World/third_party/mmyolo/demo/15_minutes_object_detection.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..47e0ccbd803c808982b2a30d55b640f0b1bd48da --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/15_minutes_object_detection.ipynb @@ -0,0 +1,1002 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO object detection\n", + "\n", + "Object detection task refers to that given a picture, the network predicts all the categories of objects included in the picture and the corresponding boundary boxes\n", + "\n", + "
\n", + "\"object\n", + "
\n", + "\n", + "Take the small dataset of cat as an example, you can easily learn MMYOLO object detection in 15 minutes. The whole process consists of the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch version\n", + "import torch\n", + "print(torch.__version__)\n", + 
"print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Cat dataset is a single-category dataset consisting of 144 pictures (the original pictures are provided by @RangeKing, and cleaned by @PeterH0323), which contains the annotation information required for training. The sample image is shown below:\n", + "\n", + "
\n", + "\"cat\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "This dataset is automatically downloaded to the `./data/cat` dir with the following directory structure:\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The cat dataset is located in the mmyolo project dir, and `data/cat/annotations` stores annotations in COCO format, and `data/cat/images` stores all images\n", + "\n", + "## Config\n", + "\n", + "Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12.\n", + "- The default training epoch is 300. Change it to 40 epoch\n", + "- Given the small size of the dataset, we opted to use fixed backbone weights\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary\n", + "\n", + "Create a `yolov5_s-v61_fast_1xb12-40e_cat.py` config file in the `configs/yolov5` folder (we have provided this config for you to use directly) and copy the following into the config file.\n", + "\n", + "```python\n", + "# Inherit and overwrite part of the config based on this config\n", + "_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py'\n", + "\n", + "data_root = './data/cat/' # dataset root\n", + "class_name = ('cat', ) # dataset category name\n", + "num_classes = len(class_name) # dataset category number\n", + "# metainfo is a configuration that must be passed to the dataloader, otherwise it is invalid\n", + "# palette is a display color for category at visualization\n", + "# The palette length must be greater than or equal to the length of the classes\n", + "metainfo = dict(classes=class_name, palette=[(20, 220, 60)])\n", + "\n", + "# Adaptive anchor based on tools/analysis_tools/optimize_anchors.py\n", + "anchors = [\n", + " [(68, 69), (154, 91), (143, 162)], 
# P3/8\n", + " [(242, 160), (189, 287), (391, 207)], # P4/16\n", + " [(353, 337), (539, 341), (443, 432)] # P5/32\n", + "]\n", + "# Max training 40 epoch\n", + "max_epochs = 40\n", + "# bs = 12\n", + "train_batch_size_per_gpu = 12\n", + "# dataloader num workers\n", + "train_num_workers = 4\n", + "\n", + "# load COCO pre-trained weight\n", + "load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa\n", + "\n", + "model = dict(\n", + " # Fixed the weight of the entire backbone without training\n", + " backbone=dict(frozen_stages=4),\n", + " bbox_head=dict(\n", + " head_module=dict(num_classes=num_classes),\n", + " prior_generator=dict(base_sizes=anchors)\n", + " ))\n", + "\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " # Dataset annotation file of json path\n", + " ann_file='annotations/trainval.json',\n", + " # Dataset prefix\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " metainfo=metainfo,\n", + " data_root=data_root,\n", + " ann_file='annotations/test.json',\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "test_dataloader = val_dataloader\n", + "\n", + "_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu\n", + "\n", + "val_evaluator = dict(ann_file=data_root + 'annotations/test.json')\n", + "test_evaluator = val_evaluator\n", + "\n", + "default_hooks = dict(\n", + " # Save weights every 10 epochs and a maximum of two weights can be saved.\n", + " # The best model is saved automatically during model evaluation\n", + " checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'),\n", + " # The warmup_mim_iter parameter is critical.\n", + " # The default value is 1000 which is not suitable for 
cat datasets.\n", + " param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10),\n", + " # The log printing interval is 5\n", + " logger=dict(type='LoggerHook', interval=5))\n", + "# The evaluation interval is 10\n", + "train_cfg = dict(max_epochs=max_epochs, val_interval=10)\n", + "```\n", + "\n", + "The above config is inherited from `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`. According to the characteristics of cat dataset updated `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes` and other config.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "Run the above training command, `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat` folder will be automatically generated, the checkpoint file and the training config file will be saved in this folder. On a low-end 1660 GPU, the entire training process takes about eight minutes.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `test.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.631\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.747\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.631\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.627\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703\n", + "```\n", + "\n", + "The above properties are printed via the COCO API, where -1 indicates that no object exists for the scale. According to the rules defined by COCO, the Cat dataset contains all large sized objects, and there are no small or medium-sized objects.\n", + "\n", + "### Some Notes\n", + "\n", + "Two key warnings are printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "- The model and loaded state dict do not match exactly\n", + "\n", + "Neither of these warnings will have any impact on performance. The first warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. 
The second warning appears because we are training in fine-tuning mode and load the COCO pre-trained weights for 80 classes.\n", + "As a result, the number of convolution channels in the final Head module does not match, so that part of the weights cannot be loaded, which is also a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3 GB of GPU memory, so if you don't have enough, consider turning on mixed-precision training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB or other back ends for real-time visualization of various indicators during training.\n", + "\n", + "#### 1 WandB\n", + "\n", + "WandB visualization requires registering on the WandB website and obtaining your WandB API key from https://wandb.ai/settings.\n", + "\n", + "
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Run the above test command, you can not only get the AP performance printed in 
the **Training** section, but also automatically save the result images to the `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/show_results` folder. Below is one of the result images: the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use 'WandbVisBackend' or 'TensorboardVisBackend'.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md)\n", + "\n", + "Due to the bias of direct visualization of `test_pipeline`, we need modify the `test_pipeline` of `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py`,\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/cat/images/IMG_20221020_112705.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. 
Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers backbone \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in the current path. The three output feature maps plotted in the above figure correspond to the small, medium and large output feature maps. Since the backbone is frozen and not actually trained here, it can be seen from the above figure that the large cat object is predicted on the small feature map, which is in line with the idea of hierarchical detection in object detection.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "As can be seen from the above figure, because the neck is involved in training and we also reset the anchors, the three output feature maps are forced to model objects of the same scale, so the three output maps of the neck look similar, which destroys the original pre-trained distribution of the backbone. At the same time, it can also be seen that 40 epochs are not enough to train on the above dataset, and the feature maps do not perform well.\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "\n", + "Based on the above feature map visualization, we can analyze Grad CAM at the feature layer of bbox level.\n", + "\n", + "Install the `grad-cam` package:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install \"grad-cam\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(a) View Grad CAM of the minimum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[2]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9v-dMkePvHMg" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "(b) View Grad CAM of the medium output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p9H9u0A-3KAD", + "outputId": "32ca5a56-052f-4930-f53c-41cc3a9dc619" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[1]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(c) View Grad CAM of the maximum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MrKan1U43uUY", + "outputId": "690f8414-a76b-4fa6-e600-7cc874ce1914" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "## EasyDeploy deployment\n", + "\n", + "Here we'll use MMYOLO's [EasyDeploy](../../../projects/easydeploy/) to demonstrate model conversion, deployment and basic inference.\n", + "\n", + "First, follow EasyDeploy's [basic documentation](../../../projects/easydeploy/docs/model_convert.md) to install the libraries required for your own device.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install onnx\n", + "%pip install onnx-simplifier # Install if you want to use simplify\n", + "%pip install tensorrt # If you have GPU environment and need to output TensorRT model you need to continue execution" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once installed, you can use the following command to convert and deploy the model trained on the cat dataset with one click. The ONNX version used here is 1.13.0 and the TensorRT version is 8.5.3.1, so keep the `--opset` value at 11. The remaining parameters need to be adjusted according to the config used. Here we export the CPU version of ONNX with `--backend` set to 1."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 534 + }, + "id": "YsRFEecU5C0w", + "outputId": "c26011d4-2836-4715-cd6b-68836294db33" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + "\t configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + "\t work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + "\t --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cpu \\\n", + " --simplify \\\n", + "\t --opset 11 \\\n", + "\t --backend 1 \\\n", + "\t --pre-topk 1000 \\\n", + "\t --keep-topk 100 \\\n", + "\t --iou-threshold 0.65 \\\n", + "\t --score-threshold 0.25\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "q1EY415x3Idx" + }, + "source": [ + "On success, you will get the converted ONNX model under `work-dir`, which is named `end2end.onnx` by default.\n", + "\n", + "Let's use `end2end.onnx` model to perform a basic image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --device cpu" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "IrjiBa5YwDQM" + }, + "source": [ + "After successful inference, the result image will be generated in the `output` folder of the default MMYOLO root directory. If you want to see the result without saving it, you can add `--show` to the end of the above command. For convenience, the following is the generated result.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "Next, let's build the engine file for TensorRT. Because a TensorRT engine is specific to the current environment and deployment version, make sure to set the export parameters accordingly. Here we export the ONNX file for TensorRT8, so `--backend` is set to 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d8zxczqiBLoB" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cuda:0 \\\n", + " --simplify \\\n", + " --opset 11 \\\n", + " --backend 2 \\\n", + " --pre-topk 1000 \\\n", + " --keep-topk 100 \\\n", + " --iou-threshold 0.65 \\\n", + " --score-threshold 0.25" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting `end2end.onnx` is the ONNX file for the TensorRT8 deployment, which we will use to complete the TensorRT engine conversion."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QFh8rIsX_kVw", + "outputId": "c5bd6929-03a8-400e-be1e-581f32b23f61" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/build_engine.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --img-size 640 640 \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Successful execution will generate the `end2end.engine` file under `work-dir`:\n", + "\n", + "```shell\n", + "work_dirs/yolov5_s-v61_fast_1xb12-40e_cat\n", + "├── 202302XX_XXXXXX\n", + "│ ├── 202302XX_XXXXXX.log\n", + "│ └── vis_data\n", + "│ ├── 202302XX_XXXXXX.json\n", + "│ ├── config.py\n", + "│ └── scalars.json\n", + "├── best_coco\n", + "│ └── bbox_mAP_epoch_40.pth\n", + "├── end2end.engine\n", + "├── end2end.onnx\n", + "├── epoch_30.pth\n", + "├── epoch_40.pth\n", + "├── last_checkpoint\n", + "└── yolov5_s-v61_fast_1xb12-40e_cat.py\n", + "```\n", + "\n", + "Let's continue use `image-demo.py` for image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "rOqXEi-jAI7Y", + "outputId": "2a21aaaa-d4ba-498a-f985-2a6a2b8d348f" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.engine \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ocHGUUEA_TjI" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/models/YOLO-World/third_party/mmyolo/demo/boxam_vis_demo.py b/models/YOLO-World/third_party/mmyolo/demo/boxam_vis_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..278574f89fe5427cb5be7b9a7fd99f70de090bd4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/boxam_vis_demo.py @@ -0,0 +1,276 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This script is in the experimental verification stage and cannot be +guaranteed to be completely correct. Currently Grad-based CAM and Grad-free CAM +are supported. + +The target detection task is different from the classification task. It not +only includes the AM map of the category, but also includes information such as +bbox and mask, so this script is named bboxam. 
+""" + +import argparse +import os.path +import warnings +from functools import partial + +import cv2 +import mmcv +from mmengine import Config, DictAction, MessageHub +from mmengine.utils import ProgressBar + +try: + from pytorch_grad_cam import AblationCAM, EigenCAM +except ImportError: + raise ImportError('Please run `pip install "grad-cam"` to install ' + 'pytorch_grad_cam package.') + +from mmyolo.utils.boxam_utils import (BoxAMDetectorVisualizer, + BoxAMDetectorWrapper, DetAblationLayer, + DetBoxScoreTarget, GradCAM, + GradCAMPlusPlus, reshape_transform) +from mmyolo.utils.misc import get_file_list + +GRAD_FREE_METHOD_MAP = { + 'ablationcam': AblationCAM, + 'eigencam': EigenCAM, + # 'scorecam': ScoreCAM, # consumes too much memory +} + +GRAD_BASED_METHOD_MAP = {'gradcam': GradCAM, 'gradcam++': GradCAMPlusPlus} + +ALL_SUPPORT_METHODS = list(GRAD_FREE_METHOD_MAP.keys() + | GRAD_BASED_METHOD_MAP.keys()) + +IGNORE_LOSS_PARAMS = { + 'yolov5': ['loss_obj'], + 'yolov6': ['loss_cls'], + 'yolox': ['loss_obj'], + 'rtmdet': ['loss_cls'], + 'yolov7': ['loss_obj'], + 'yolov8': ['loss_cls'], + 'ppyoloe': ['loss_cls'], +} + +# This parameter is required in some algorithms +# for calculating Loss +message_hub = MessageHub.get_current_instance() +message_hub.runtime_info['epoch'] = 0 + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize Box AM') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--method', + default='gradcam', + choices=ALL_SUPPORT_METHODS, + help='Type of method to use, supports ' + f'{", ".join(ALL_SUPPORT_METHODS)}.') + parser.add_argument( + '--target-layers', + default=['neck.out_layers[2]'], + nargs='+', + type=str, + help='The target layers to get Box AM, if not set, the tool will ' + 'specify the neck.out_layers[2]') + parser.add_argument( + '--out-dir', 
default='./output', help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the CAM results') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--topk', + type=int, + default=-1, + help='Select topk predict results to show. -1 means all.') + parser.add_argument( + '--max-shape', + nargs='+', + type=int, + default=-1, + help='max shapes. Its purpose is to save GPU memory. ' + 'The activation map is scaled and then evaluated. ' + 'If set to -1, it means no scaling.') + parser.add_argument( + '--preview-model', + default=False, + action='store_true', + help='To preview all the model layers') + parser.add_argument( + '--norm-in-bbox', action='store_true', help='Norm in bbox of am image') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + # Only used by AblationCAM + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='batch of inference of AblationCAM') + parser.add_argument( + '--ratio-channels-to-ablate', + type=float, + default=0.5, + help='Making it much faster of AblationCAM. 
' + 'The parameter controls how many channels should be ablated') + + args = parser.parse_args() + return args + + +def init_detector_and_visualizer(args, cfg): + max_shape = args.max_shape + if not isinstance(max_shape, list): + max_shape = [args.max_shape] + assert len(max_shape) == 1 or len(max_shape) == 2 + + model_wrapper = BoxAMDetectorWrapper( + cfg, args.checkpoint, args.score_thr, device=args.device) + + if args.preview_model: + print(model_wrapper.detector) + print('\n Please remove `--preview-model` to get the BoxAM.') + return None, None + + target_layers = [] + for target_layer in args.target_layers: + try: + target_layers.append( + eval(f'model_wrapper.detector.{target_layer}')) + except Exception as e: + print(model_wrapper.detector) + raise RuntimeError('layer does not exist', e) + + ablationcam_extra_params = { + 'batch_size': args.batch_size, + 'ablation_layer': DetAblationLayer(), + 'ratio_channels_to_ablate': args.ratio_channels_to_ablate + } + + if args.method in GRAD_BASED_METHOD_MAP: + method_class = GRAD_BASED_METHOD_MAP[args.method] + is_need_grad = True + else: + method_class = GRAD_FREE_METHOD_MAP[args.method] + is_need_grad = False + + boxam_detector_visualizer = BoxAMDetectorVisualizer( + method_class, + model_wrapper, + target_layers, + reshape_transform=partial( + reshape_transform, max_shape=max_shape, is_need_grad=is_need_grad), + is_need_grad=is_need_grad, + extra_params=ablationcam_extra_params) + return model_wrapper, boxam_detector_visualizer + + +def main(): + args = parse_args() + + # hard code + ignore_loss_params = None + for param_keys in IGNORE_LOSS_PARAMS: + if param_keys in args.config: + print(f'The algorithm currently used is {param_keys}') + ignore_loss_params = IGNORE_LOSS_PARAMS[param_keys] + break + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + model_wrapper, 
boxam_detector_visualizer = init_detector_and_visualizer( + args, cfg) + + # get file list + image_list, source_type = get_file_list(args.img) + + progress_bar = ProgressBar(len(image_list)) + + for image_path in image_list: + image = cv2.imread(image_path) + model_wrapper.set_input_data(image) + + # forward detection results + result = model_wrapper()[0] + + pred_instances = result.pred_instances + # Get candidate predict info with score threshold + pred_instances = pred_instances[pred_instances.scores > args.score_thr] + + if len(pred_instances) == 0: + warnings.warn('empty detection results! skip this') + continue + + if args.topk > 0: + pred_instances = pred_instances[:args.topk] + + targets = [ + DetBoxScoreTarget( + pred_instances, + device=args.device, + ignore_loss_params=ignore_loss_params) + ] + + if args.method in GRAD_BASED_METHOD_MAP: + model_wrapper.need_loss(True) + model_wrapper.set_input_data(image, pred_instances) + boxam_detector_visualizer.switch_activations_and_grads( + model_wrapper) + + # get box am image + grayscale_boxam = boxam_detector_visualizer(image, targets=targets) + + # draw cam on image + pred_instances = pred_instances.numpy() + image_with_bounding_boxes = boxam_detector_visualizer.show_am( + image, + pred_instances, + grayscale_boxam, + with_norm_in_bboxes=args.norm_in_bbox) + + if source_type['is_dir']: + filename = os.path.relpath(image_path, args.img).replace('/', '_') + else: + filename = os.path.basename(image_path) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + if out_file: + mmcv.imwrite(image_with_bounding_boxes, out_file) + else: + cv2.namedWindow(filename, 0) + cv2.imshow(filename, image_with_bounding_boxes) + cv2.waitKey(0) + + # switch + if args.method in GRAD_BASED_METHOD_MAP: + model_wrapper.need_loss(False) + boxam_detector_visualizer.switch_activations_and_grads( + model_wrapper) + + progress_bar.update() + + if not args.show: + print(f'All done!' 
+ f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/demo/deploy_demo.py b/models/YOLO-World/third_party/mmyolo/demo/deploy_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..f5d08df47fc9740bc1d2ca837d5188f8b4eac267 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/deploy_demo.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Deploy demo for mmdeploy. + +This script helps users run the mmdeploy demo after converting the +checkpoint to a backend model. + +Usage: + python deploy_demo.py img \ + config \ + checkpoint \ + [--deploy-cfg DEPLOY_CFG] \ + [--device DEVICE] \ + [--out-dir OUT_DIR] \ + [--show] \ + [--score-thr SCORE_THR] + +Example: + python deploy_demo.py \ + ${MMYOLO_PATH}/data/cat/images \ + ./yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + ./end2end.engine \ + --deploy-cfg ./detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + --out-dir ${MMYOLO_PATH}/work_dirs/deploy_predict_out \ + --device cuda:0 \ + --score-thr 0.5 +""" +import argparse +import os + +import torch +from mmengine import ProgressBar + +from mmyolo.utils.misc import get_file_list + +try: + from mmdeploy.apis.utils import build_task_processor + from mmdeploy.utils import get_input_shape, load_config +except ImportError: + raise ImportError( + 'mmdeploy is not installed, please see ' + 'https://mmdeploy.readthedocs.io/en/1.x/01-how-to-build/build_from_source.html' # noqa + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description='For mmdeploy predict') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='model config root') + parser.add_argument('checkpoint', help='checkpoint backend model path') + parser.add_argument('--deploy-cfg', help='deploy config path') + parser.add_argument( + '--device', default='cuda:0', help='device used for conversion') + 
parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + args = parser.parse_args() + return args + + +# TODO Still need to refactor to not building dataset. +def main(): + args = parse_args() + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # read deploy_cfg and config + deploy_cfg, model_cfg = load_config(args.deploy_cfg, args.config) + + # build task and backend model + task_processor = build_task_processor(model_cfg, deploy_cfg, args.device) + model = task_processor.build_backend_model([args.checkpoint]) + + # get model input shape + input_shape = get_input_shape(deploy_cfg) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + # process input image + model_inputs, _ = task_processor.create_input(file, input_shape) + + # do model inference + with torch.no_grad(): + result = model.test_step(model_inputs) + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # filter score + result = result[0] + result.pred_instances = result.pred_instances[ + result.pred_instances.scores > args.score_thr] + + # visualize results + task_processor.visualize( + image=file, + model=model, + result=result, + show_result=args.show, + window_name=os.path.basename(filename), + output_file=out_file) + + progress_bar.update() + + print('All done!') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/demo/featmap_vis_demo.py b/models/YOLO-World/third_party/mmyolo/demo/featmap_vis_demo.py new file mode 100644 index 
0000000000000000000000000000000000000000..892e73d616b0e629ddfcc276e8eb4ca289f5085b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/featmap_vis_demo.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +from typing import Sequence + +import mmcv +from mmdet.apis import inference_detector, init_detector +from mmengine import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils.misc import auto_arrange_images, get_file_list + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize feature map') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--target-layers', + default=['backbone'], + nargs='+', + type=str, + help='The target layers to get feature map, if not set, the tool will ' + 'specify the backbone') + parser.add_argument( + '--preview-model', + default=False, + action='store_true', + help='To preview all the model layers') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--show', action='store_true', help='Show the featmap results') + parser.add_argument( + '--channel-reduction', + default='select_max', + help='Reduce multiple channels to a single channel') + parser.add_argument( + '--topk', + type=int, + default=4, + help='Select topk channel to show by the sum of each channel') + parser.add_argument( + '--arrangement', + nargs='+', + type=int, + default=[2, 2], + help='The arrangement of featmap when channel_reduction is ' + 'not None and topk > 0') + 
parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +class ActivationsWrapper: + + def __init__(self, model, target_layers): + self.model = model + self.activations = [] + self.handles = [] + self.image = None + for target_layer in target_layers: + self.handles.append( + target_layer.register_forward_hook(self.save_activation)) + + def save_activation(self, module, input, output): + self.activations.append(output) + + def __call__(self, img_path): + self.activations = [] + results = inference_detector(self.model, img_path) + return results, self.activations + + def release(self): + for handle in self.handles: + handle.remove() + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + channel_reduction = args.channel_reduction + if channel_reduction == 'None': + channel_reduction = None + assert len(args.arrangement) == 2 + + model = init_detector(args.config, args.checkpoint, device=args.device) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + if args.preview_model: + print(model) + print('\n This flag is only show model, if you want to continue, ' + 'please remove `--preview-model` to get the feature map.') + return + + target_layers = [] + for target_layer in args.target_layers: + try: + target_layers.append(eval(f'model.{target_layer}')) + except Exception as e: + print(model) + raise RuntimeError('layer does not exist', e) + + 
activations_wrapper = ActivationsWrapper(model, target_layers) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + image_list, source_type = get_file_list(args.img) + + progress_bar = ProgressBar(len(image_list)) + for image_path in image_list: + result, featmaps = activations_wrapper(image_path) + if not isinstance(featmaps, Sequence): + featmaps = [featmaps] + + flatten_featmaps = [] + for featmap in featmaps: + if isinstance(featmap, Sequence): + flatten_featmaps.extend(featmap) + else: + flatten_featmaps.append(featmap) + + img = mmcv.imread(image_path) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(image_path, args.img).replace('/', '_') + else: + filename = os.path.basename(image_path) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # show the results + shown_imgs = [] + visualizer.add_datasample( + 'result', + img, + data_sample=result, + draw_gt=False, + show=False, + wait_time=0, + out_file=None, + pred_score_thr=args.score_thr) + drawn_img = visualizer.get_image() + + for featmap in flatten_featmaps: + shown_img = visualizer.draw_featmap( + featmap[0], + drawn_img, + channel_reduction=channel_reduction, + topk=args.topk, + arrangement=args.arrangement) + shown_imgs.append(shown_img) + + shown_imgs = auto_arrange_images(shown_imgs) + + progress_bar.update() + if out_file: + mmcv.imwrite(shown_imgs[..., ::-1], out_file) + + if args.show: + visualizer.show(shown_imgs) + + if not args.show: + print(f'All done!' 
+ f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +# Please refer to the usage tutorial: +# https://github.com/open-mmlab/mmyolo/blob/main/docs/zh_cn/user_guides/visualization.md # noqa +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/demo/image_demo.py b/models/YOLO-World/third_party/mmyolo/demo/image_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..fa2cfb2a03f7e8328dd068851433d69c9f4a0db5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/image_demo.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from argparse import ArgumentParser +from pathlib import Path + +import mmcv +from mmdet.apis import inference_detector, init_detector +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import ProgressBar, path + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils import switch_to_deploy +from mmyolo.utils.labelme_utils import LabelmeFormat +from mmyolo.utils.misc import get_file_list, show_data_classes + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--class-name', + nargs='+', + type=str, + 
help='Only Save those classes if set') + parser.add_argument( + '--to-labelme', + action='store_true', + help='Output labelme style label file') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.to_labelme and args.show: + raise RuntimeError('`--to-labelme` or `--show` only ' + 'can choose one at the same time.') + config = args.config + + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + if args.tta: + assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + config.model = ConfigDict(**config.tta_model, module=config.model) + test_data_cfg = config.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. + if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = config.tta_pipeline + + # TODO: TTA mode will error if cfg_options is not set. + # This is an mmdet issue and needs to be fixed later. 
+ # build the model from a config file and a checkpoint file + model = init_detector( + config, args.checkpoint, device=args.device, cfg_options={}) + + if args.deploy: + switch_to_deploy(model) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + + # get model class name + dataset_classes = model.dataset_meta.get('classes') + + # ready for labelme format if it is needed + to_label_format = LabelmeFormat(classes=dataset_classes) + + # check class name + if args.class_name is not None: + for class_name in args.class_name: + if class_name in dataset_classes: + continue + show_data_classes(dataset_classes) + raise RuntimeError( + 'Expected args.class_name to be one of the list, ' + f'but got "{class_name}"') + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + result = inference_detector(model, file) + + img = mmcv.imread(file) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + progress_bar.update() + + # Get candidate predict info with score threshold + pred_instances = result.pred_instances[ + result.pred_instances.scores > args.score_thr] + + if args.to_labelme: + # save result to labelme files + out_file = out_file.replace( + os.path.splitext(out_file)[-1], '.json') + to_label_format(pred_instances, result.metainfo, out_file, + args.class_name) + continue + + visualizer.add_datasample( + filename, + img, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr) + + if not args.show and not args.to_labelme: + print_log( + f'\nResults have been saved at 
{os.path.abspath(args.out_dir)}') + + elif args.to_labelme: + print_log('\nLabelme format label files ' + f'had all been saved in {args.out_dir}') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/demo/large_image_demo.py b/models/YOLO-World/third_party/mmyolo/demo/large_image_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..bdbc3a56d0056c3965fac28c49e18b31355a2029 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/large_image_demo.py @@ -0,0 +1,294 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Perform MMYOLO inference on large images (as satellite imagery) as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth # noqa: E501, E261. + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth +``` +""" + +import os +import random +from argparse import ArgumentParser +from pathlib import Path + +import mmcv +import numpy as np +from mmdet.apis import inference_detector, init_detector +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import ProgressBar + +try: + from sahi.slicing import slice_image +except ImportError: + raise ImportError('Please run "pip install -U sahi" ' + 'to install sahi first for large image inference.') + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils import switch_to_deploy +from mmyolo.utils.large_image import merge_results_by_nms, shift_predictions +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser( + description='Perform MMYOLO inference on large images.') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + 
parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--patch-size', type=int, default=640, help='The size of patches') + parser.add_argument( + '--patch-overlap-ratio', + type=float, + default=0.25, + help='Ratio of overlap between two patches') + parser.add_argument( + '--merge-iou-thr', + type=float, + default=0.25, + help='IoU threshold for merging results') + parser.add_argument( + '--merge-nms-type', + type=str, + default='nms', + help='NMS type for merging results') + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='Batch size, must be greater than or equal to 1') + parser.add_argument( + '--debug', + action='store_true', + help='Export debug results before merging') + parser.add_argument( + '--save-patch', + action='store_true', + help='Save the results of each patch. ' + 'The `--debug` must be enabled.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + config = args.config + + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + if args.tta: + assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' 
\ + " Can't use tta !" + assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + config.model = ConfigDict(**config.tta_model, module=config.model) + test_data_cfg = config.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. + if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = config.tta_pipeline + + # TODO: TTA mode will error if cfg_options is not set. + # This is an mmdet issue and needs to be fixed later. + # build the model from a config file and a checkpoint file + model = init_detector( + config, args.checkpoint, device=args.device, cfg_options={}) + + if args.deploy: + switch_to_deploy(model) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + print(f'Performing inference on {len(files)} images.... 
' + 'This may take a while.') + progress_bar = ProgressBar(len(files)) + for file in files: + # read image + img = mmcv.imread(file) + + # arrange slices + height, width = img.shape[:2] + sliced_image_object = slice_image( + img, + slice_height=args.patch_size, + slice_width=args.patch_size, + auto_slice_resolution=False, + overlap_height_ratio=args.patch_overlap_ratio, + overlap_width_ratio=args.patch_overlap_ratio, + ) + + # perform sliced inference + slice_results = [] + start = 0 + while True: + # prepare batch slices + end = min(start + args.batch_size, len(sliced_image_object)) + images = [] + for sliced_image in sliced_image_object.images[start:end]: + images.append(sliced_image) + + # forward the model + slice_results.extend(inference_detector(model, images)) + + if end >= len(sliced_image_object): + break + start += args.batch_size + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + + img = mmcv.imconvert(img, 'bgr', 'rgb') + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # export debug images + if args.debug: + # export sliced image results + name, suffix = os.path.splitext(filename) + + shifted_instances = shift_predictions( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width)) + merged_result = slice_results[0].clone() + merged_result.pred_instances = shifted_instances + + debug_file_name = name + '_debug' + suffix + debug_out_file = None if args.show else os.path.join( + args.out_dir, debug_file_name) + visualizer.set_image(img.copy()) + + debug_grids = [] + for starting_point in sliced_image_object.starting_pixels: + start_point_x = starting_point[0] + start_point_y = starting_point[1] + end_point_x = start_point_x + args.patch_size + end_point_y = start_point_y + args.patch_size + debug_grids.append( + [start_point_x, start_point_y, end_point_x, end_point_y]) + debug_grids = np.array(debug_grids) + 
debug_grids[:, 0::2] = np.clip(debug_grids[:, 0::2], 1, + img.shape[1] - 1) + debug_grids[:, 1::2] = np.clip(debug_grids[:, 1::2], 1, + img.shape[0] - 1) + + palette = np.random.randint(0, 256, size=(len(debug_grids), 3)) + palette = [tuple(c) for c in palette] + line_styles = random.choices(['-', '-.', ':'], k=len(debug_grids)) + visualizer.draw_bboxes( + debug_grids, + edge_colors=palette, + alpha=1, + line_styles=line_styles) + visualizer.draw_bboxes( + debug_grids, face_colors=palette, alpha=0.15) + + visualizer.draw_texts( + list(range(len(debug_grids))), + debug_grids[:, :2] + 5, + colors='w') + + visualizer.add_datasample( + debug_file_name, + visualizer.get_image(), + data_sample=merged_result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=debug_out_file, + pred_score_thr=args.score_thr, + ) + + if args.save_patch: + debug_patch_out_dir = os.path.join(args.out_dir, + f'{name}_patch') + for i, slice_result in enumerate(slice_results): + patch_out_file = os.path.join( + debug_patch_out_dir, + f'{filename}_slice_{i}_result.jpg') + image = mmcv.imconvert(sliced_image_object.images[i], + 'bgr', 'rgb') + + visualizer.add_datasample( + 'patch_result', + image, + data_sample=slice_result, + draw_gt=False, + show=False, + wait_time=0, + out_file=patch_out_file, + pred_score_thr=args.score_thr, + ) + + image_result = merge_results_by_nms( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width), + nms_cfg={ + 'type': args.merge_nms_type, + 'iou_threshold': args.merge_iou_thr + }) + + visualizer.add_datasample( + filename, + img, + data_sample=image_result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr, + ) + progress_bar.update() + + if not args.show or (args.debug and args.save_patch): + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git 
a/models/YOLO-World/third_party/mmyolo/demo/video_demo.py b/models/YOLO-World/third_party/mmyolo/demo/video_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..d8317a2c6c777eaa9cc6aab27e55bf53efe9e8fd --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/demo/video_demo.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Perform MMYOLO inference on a video as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth # noqa: E501, E261. + +python demo/video_demo.py \ + demo/video_demo.mp4 \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --out demo_result.mp4 +``` +""" +import argparse + +import cv2 +import mmcv +from mmcv.transforms import Compose +from mmdet.apis import inference_detector, init_detector +from mmengine.utils import track_iter_progress + +from mmyolo.registry import VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMYOLO video demo') + parser.add_argument('video', help='Video file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument('--out', type=str, help='Output video file') + parser.add_argument('--show', action='store_true', help='Show video') + parser.add_argument( + '--wait-time', + type=float, + default=1, + help='The interval of show (s), 0 is block') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.out or args.show, \ + ('Please specify at least one operation (save/show the ' + 'video) with the argument "--out" or 
"--show"') + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + + # build test pipeline + model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + # the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + video_reader = mmcv.VideoReader(args.video) + video_writer = None + if args.out: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter( + args.out, fourcc, video_reader.fps, + (video_reader.width, video_reader.height)) + + for frame in track_iter_progress(video_reader): + result = inference_detector(model, frame, test_pipeline=test_pipeline) + visualizer.add_datasample( + name='video', + image=frame, + data_sample=result, + draw_gt=False, + show=False, + pred_score_thr=args.score_thr) + frame = visualizer.get_image() + + if args.show: + cv2.namedWindow('video', 0) + mmcv.imshow(frame, 'video', args.wait_time) + if args.out: + video_writer.write(frame) + + if video_writer: + video_writer.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/docker/Dockerfile b/models/YOLO-World/third_party/mmyolo/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..fc65431a2940604118aaf747290442da78741365 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/docker/Dockerfile @@ -0,0 +1,36 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \ + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + FORCE_CUDA="1" + +RUN rm 
/etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ +# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update \ + && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMEngine , MMCV and MMDet +RUN pip install --no-cache-dir openmim && \ + mim install --no-cache-dir "mmengine>=0.6.0" "mmcv>=2.0.0rc4,<2.1.0" "mmdet>=3.0.0,<4.0.0" + +# Install MMYOLO +RUN git clone https://github.com/open-mmlab/mmyolo.git /mmyolo && \ + cd /mmyolo && \ + mim install --no-cache-dir -e . 
# Deployment image: MMYOLO + MMDeploy with the ONNX Runtime and TensorRT
# backends, built on top of NVIDIA's PyTorch container.
FROM nvcr.io/nvidia/pytorch:22.04-py3

WORKDIR /openmmlab
ARG ONNXRUNTIME_VERSION=1.8.1
ENV DEBIAN_FRONTEND=noninteractive \
    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn \
    FORCE_CUDA="1"

# Rotate the NVIDIA repository signing keys.
RUN apt-key del 7fa2af80 \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# (Optional)
# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list \
#    && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

RUN apt-get update \
    && apt-get install -y ffmpeg git libgl1-mesa-glx libopencv-dev \
        libsm6 libspdlog-dev libssl-dev ninja-build libxext6 libxrender-dev \
        libglib2.0-0 vim wget --no-install-recommends \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# get onnxruntime
# The archive is deleted after extraction; only the unpacked directory
# (referenced by ONNXRUNTIME_DIR below) is needed in the image.
RUN wget -q https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \
    && tar -zxvf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \
    && rm onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \
    && pip install --no-cache-dir onnxruntime-gpu==${ONNXRUNTIME_VERSION} \
    && pip install --no-cache-dir pycuda

# Install OPENMIM MMENGINE MMDET
RUN pip install --no-cache-dir openmim \
    && mim install --no-cache-dir "mmengine>=0.6.0" "mmdet>=3.0.0,<4.0.0" \
    && mim install --no-cache-dir opencv-python==4.5.5.64 opencv-python-headless==4.5.5.64

# Build MMCV from source with its compiled ops enabled.
RUN git clone https://github.com/open-mmlab/mmcv.git -b 2.x mmcv \
    && cd mmcv \
    && mim install --no-cache-dir -r requirements/optional.txt \
    && MMCV_WITH_OPS=1 mim install --no-cache-dir -e . -v \
    && cd ..

# Install MMYOLO
RUN git clone https://github.com/open-mmlab/mmyolo.git -b dev mmyolo \
    && cd mmyolo \
    && mim install --no-cache-dir -e . \
    && cd ..

# Install MMDEPLOY
ENV ONNXRUNTIME_DIR=/openmmlab/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION} \
    TENSORRT_DIR=/usr/lib/x86_64-linux-gnu \
    CUDNN_DIR=/usr/lib/x86_64-linux-gnu

RUN git clone https://github.com/open-mmlab/mmdeploy -b dev-1.x mmdeploy \
    && cd mmdeploy \
    && git submodule update --init --recursive \
    && mkdir -p build \
    && cd build \
    && cmake -DMMDEPLOY_TARGET_BACKENDS="ort;trt" -DONNXRUNTIME_DIR=${ONNXRUNTIME_DIR} -DTENSORRT_DIR=${TENSORRT_DIR} -DCUDNN_DIR=${CUDNN_DIR} .. \
    && make -j$(nproc) \
    && make install \
    && cd .. \
    && mim install --no-cache-dir -e .

# Fix undefined symbol bug
# printf is used instead of `echo -e`: RUN executes under /bin/sh, and a
# shell whose builtin echo does not understand `-e` would write a literal
# "-e" line into .bashrc.
RUN printf '\nexport LD_LIBRARY_PATH=%s\nldconfig\n' \
        "${ONNXRUNTIME_DIR}/lib:${TENSORRT_DIR}/lib:${CUDNN_DIR}/lib64:${LD_LIBRARY_PATH}" \
        >> /root/.bashrc
import mmcv
import mmdet
import mmengine
from mmengine.utils import digit_version

from .version import __version__, version_info

# Supported version windows of the OpenMMLab dependencies. They are checked
# once at import time so that an incompatible install fails immediately
# with an actionable message.
mmcv_minimum_version = '2.0.0rc4'
mmcv_maximum_version = '2.1.0'
mmcv_version = digit_version(mmcv.__version__)

mmengine_minimum_version = '0.7.1'
mmengine_maximum_version = '1.0.0'
mmengine_version = digit_version(mmengine.__version__)

mmdet_minimum_version = '3.0.0'
mmdet_maximum_version = '4.0.0'
mmdet_version = digit_version(mmdet.__version__)


# NOTE: the upper bound for mmcv is inclusive (unlike mmengine and mmdet
# below), so the message has to advertise `<=` to match what is accepted.
assert (mmcv_version >= digit_version(mmcv_minimum_version)
        and mmcv_version <= digit_version(mmcv_maximum_version)), \
    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
    f'Please install mmcv>={mmcv_minimum_version}, ' \
    f'<={mmcv_maximum_version}.'

assert (mmengine_version >= digit_version(mmengine_minimum_version)
        and mmengine_version < digit_version(mmengine_maximum_version)), \
    f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
    f'Please install mmengine>={mmengine_minimum_version}, ' \
    f'<{mmengine_maximum_version}.'

assert (mmdet_version >= digit_version(mmdet_minimum_version)
        and mmdet_version < digit_version(mmdet_maximum_version)), \
    f'MMDetection=={mmdet.__version__} is used but incompatible. ' \
    f'Please install mmdet>={mmdet_minimum_version}, ' \
    f'<{mmdet_maximum_version}.'

__all__ = ['__version__', 'version_info', 'digit_version']
from typing import Any

from mmengine.dataset import force_full_init

try:
    from mmpose.datasets import CocoDataset as MMPoseCocoDataset
except ImportError:
    # mmpose is optional; `object` acts as a placeholder base class so the
    # module can still be imported without it.
    MMPoseCocoDataset = object

from ..registry import DATASETS


@DATASETS.register_module()
class PoseCocoDataset(MMPoseCocoDataset):
    """COCO pose dataset registered in the MMYOLO ``DATASETS`` registry.

    It derives from mmpose's ``CocoDataset`` when mmpose is importable.
    Instantiating it without mmpose raises ``ImportError``.
    """

    METAINFO: dict = dict(from_file='configs/_base_/pose/coco.py')

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the mmpose dataset.

        Raises:
            ImportError: If mmpose could not be imported and the placeholder
                base class is in use.
        """
        mmpose_missing = MMPoseCocoDataset is object
        if mmpose_missing:
            raise ImportError(
                'Please run "mim install -r requirements/mmpose.txt" '
                'to install mmpose first for PoseCocoDataset.')
        super().__init__(*args, **kwargs)

    @force_full_init
    def prepare_data(self, idx) -> Any:
        """Fetch the sample at ``idx`` and run it through the pipeline.

        The dataset itself is stored under the ``'dataset'`` key of the
        sample before the pipeline is called.

        Args:
            idx: Index of the sample to prepare.

        Returns:
            Any: Whatever ``self.pipeline`` returns for the sample.
        """
        sample = self.get_data_info(idx)
        sample['dataset'] = self
        processed = self.pipeline(sample)
        return processed
import numpy as np
from mmcv.transforms import to_tensor
from mmdet.datasets.transforms import PackDetInputs as MMDET_PackDetInputs
from mmdet.structures import DetDataSample
from mmdet.structures.bbox import BaseBoxes
from mmengine.structures import InstanceData, PixelData

from mmyolo.registry import TRANSFORMS


@TRANSFORMS.register_module()
class PackDetInputs(MMDET_PackDetInputs):
    """Pack the inputs data for the detection / semantic segmentation /
    panoptic segmentation.

    Compared to mmdet, we just add the `gt_panoptic_seg` field and logic.
    The mapping table used here also carries the keypoint fields
    (``gt_keypoints`` / ``gt_keypoints_visible``).
    """
    mapping_table = {
        'gt_bboxes': 'bboxes',
        'gt_bboxes_labels': 'labels',
        'gt_masks': 'masks',
        'gt_keypoints': 'keypoints',
        'gt_keypoints_visible': 'keypoints_visible'
    }

    def transform(self, results: dict) -> dict:
        """Pack the pipeline results into model inputs and a data sample.

        Args:
            results (dict): Result dict from the data pipeline.

        Returns:
            dict:

            - 'inputs' (obj:`torch.Tensor`): The forward data of models
              (only present when ``results`` contains ``'img'``).
            - 'data_samples' (obj:`DetDataSample`): The annotation info of
              the sample.
        """
        packed_results = dict()
        if 'img' in results:
            image = results['img']
            if len(image.shape) < 3:
                image = np.expand_dims(image, -1)
            # Choosing the HWC -> CHW conversion by memory layout speeds the
            # step up by roughly 3-5 times:
            #   * contiguous input: `torch.permute()` + `torch.contiguous()`
            #   * otherwise: `numpy.transpose()` +
            #     `numpy.ascontiguousarray()`
            # Refer to https://github.com/open-mmlab/mmdetection/pull/9533
            # for more details
            if image.flags.c_contiguous:
                image = to_tensor(image).permute(2, 0, 1).contiguous()
            else:
                image = np.ascontiguousarray(image.transpose(2, 0, 1))
                image = to_tensor(image)

            packed_results['inputs'] = image

        has_ignore_flags = 'gt_ignore_flags' in results
        if has_ignore_flags:
            valid_idx = np.where(results['gt_ignore_flags'] == 0)[0]
            ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0]
        if 'gt_keypoints' in results:
            # Split the keypoint structure into its two raw fields so that
            # both are handled by the generic mapping loop below.
            kpt_struct = results['gt_keypoints']
            results['gt_keypoints_visible'] = kpt_struct.keypoints_visible
            results['gt_keypoints'] = kpt_struct.keypoints

        data_sample = DetDataSample()
        instance_data = InstanceData()
        ignore_instance_data = InstanceData()

        for src_key, dst_key in self.mapping_table.items():
            if src_key not in results:
                continue
            value = results[src_key]
            # Masks and box structures are stored as they are; everything
            # else is converted with `to_tensor`.
            keep_raw = src_key == 'gt_masks' or isinstance(value, BaseBoxes)
            if has_ignore_flags:
                kept = value[valid_idx]
                ignored = value[ignore_idx]
                if not keep_raw:
                    kept = to_tensor(kept)
                    ignored = to_tensor(ignored)
                instance_data[dst_key] = kept
                ignore_instance_data[dst_key] = ignored
            elif keep_raw:
                instance_data[dst_key] = value
            else:
                instance_data[dst_key] = to_tensor(value)

        data_sample.gt_instances = instance_data
        data_sample.ignored_instances = ignore_instance_data

        if 'gt_seg_map' in results:
            seg_map = results['gt_seg_map'][None, ...].copy()
            data_sample.gt_sem_seg = PixelData(sem_seg=to_tensor(seg_map))

        # In order to unify the support for the overlap mask annotations
        # i.e. mask overlap annotations in (h,w) format,
        # we use the gt_panoptic_seg field to unify the modeling
        if 'gt_panoptic_seg' in results:
            data_sample.gt_panoptic_seg = PixelData(
                pan_seg=results['gt_panoptic_seg'])

        img_meta = {}
        for key in self.meta_keys:
            assert key in results, f'`{key}` is not found in `results`, ' \
                f'the valid keys are {list(results)}.'
            img_meta[key] = results[key]

        data_sample.set_metainfo(img_meta)
        packed_results['data_samples'] = data_sample

        return packed_results
from abc import ABCMeta
from copy import deepcopy
from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union

import numpy as np
import torch
from torch import Tensor

DeviceType = Union[str, torch.device]
T = TypeVar('T')
IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor,
                  torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray]


class Keypoints(metaclass=ABCMeta):
    """The Keypoints class is for keypoints representation.

    Args:
        keypoints (Tensor or np.ndarray): The keypoint data with shape of
            (N, K, 2).
        keypoints_visible (Tensor or np.ndarray): The visibility of keypoints
            with shape of (N, K).
        device (str or torch.device, Optional): device of keypoints.
            Default to None.
        clone (bool): Whether clone ``keypoints`` or not. Defaults to True.
        flip_indices (list, Optional): The indices of keypoints when the
            images is flipped. Defaults to None.

    Notes:
        N: the number of instances.
        K: the number of keypoints.
    """

    def __init__(self,
                 keypoints: Union[Tensor, np.ndarray],
                 keypoints_visible: Union[Tensor, np.ndarray],
                 device: Optional[DeviceType] = None,
                 clone: bool = True,
                 flip_indices: Optional[List] = None) -> None:

        assert len(keypoints_visible) == len(keypoints)
        assert keypoints.ndim == 3
        assert keypoints_visible.ndim == 2

        keypoints = torch.as_tensor(keypoints)
        keypoints_visible = torch.as_tensor(keypoints_visible)

        if device is not None:
            keypoints = keypoints.to(device=device)
            keypoints_visible = keypoints_visible.to(device=device)

        if clone:
            keypoints = keypoints.clone()
            keypoints_visible = keypoints_visible.clone()

        self.keypoints = keypoints
        self.keypoints_visible = keypoints_visible
        self.flip_indices = flip_indices

    def flip_(self,
              img_shape: Tuple[int, int],
              direction: str = 'horizontal') -> None:
        """Flip keypoints horizontally in-place.

        The x coordinate is mirrored about the image width. When
        ``flip_indices`` is set, keypoints and their visibility are also
        reordered along the keypoint axis with it.

        Args:
            img_shape (Tuple[int, int]): A tuple of image height and width.
            direction (str): Flip direction. Only "horizontal" is
                supported. Defaults to "horizontal".
        """
        assert direction == 'horizontal'
        self.keypoints[..., 0] = img_shape[1] - self.keypoints[..., 0]
        # Indexing with `None` would insert a new axis instead of reordering,
        # so the permutation is applied only when one was provided.
        if self.flip_indices is not None:
            self.keypoints = self.keypoints[:, self.flip_indices]
            self.keypoints_visible = self.keypoints_visible[
                :, self.flip_indices]

    def translate_(self, distances: Tuple[float, float]) -> None:
        """Translate keypoints in-place.

        Args:
            distances (Tuple[float, float]): translate distances. The first
                is horizontal distance and the second is vertical distance.
        """
        assert len(distances) == 2
        distances = self.keypoints.new_tensor(distances).reshape(1, 1, 2)
        self.keypoints = self.keypoints + distances

    def rescale_(self, scale_factor: Tuple[float, float]) -> None:
        """Rescale keypoint coordinates by ``scale_factor`` in-place.

        Args:
            scale_factor (Tuple[float, float]): factors for scaling the
                coordinates of the last axis, in that order. The length
                should be 2.
        """
        assert len(scale_factor) == 2

        scale_factor = self.keypoints.new_tensor(scale_factor).reshape(1, 1, 2)
        self.keypoints = self.keypoints * scale_factor

    def clip_(self, img_shape: Tuple[int, int]) -> None:
        """Mark keypoints outside the image boundary as invisible in-place.

        Coordinates are left untouched; only ``keypoints_visible`` is zeroed
        for keypoints with a negative coordinate, ``x > width`` or
        ``y > height``.

        Args:
            img_shape (Tuple[int, int]): A tuple of image height and width.
        """

        kpt_outside = torch.logical_or(
            torch.logical_or(self.keypoints[..., 0] < 0,
                             self.keypoints[..., 1] < 0),
            torch.logical_or(self.keypoints[..., 0] > img_shape[1],
                             self.keypoints[..., 1] > img_shape[0]))
        self.keypoints_visible[kpt_outside] *= 0

    def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None:
        """Geometrically transform keypoints in-place using a homography
        matrix.

        Args:
            homography_matrix (Tensor or np.ndarray): A 3x3 tensor or ndarray
                representing the homography matrix for the transformation.
        """
        if isinstance(homography_matrix, np.ndarray):
            homography_matrix = self.keypoints.new_tensor(homography_matrix)

        # Convert keypoints to homogeneous coordinates: (N, K, 3)
        homogeneous = torch.cat([
            self.keypoints,
            self.keypoints.new_ones(*self.keypoints.shape[:-1], 1)
        ],
                                dim=-1)

        # Row-vector form of `H @ p` for every keypoint. The batched matmul
        # also works when there are no instances (N == 0), where a
        # flatten/reshape round trip with an inferred dimension fails.
        projected = torch.matmul(homogeneous,
                                 homography_matrix.transpose(0, 1))

        # Convert back to non-homogeneous coordinates
        self.keypoints = projected[..., :2] / projected[..., 2:3]

    @classmethod
    def cat(cls: Type[T], kps_list: Sequence[T], dim: int = 0) -> T:
        """Concatenates an instance list into one single instance. Similar
        to ``torch.cat``.

        ``flip_indices`` of the result is taken from the first element.

        Args:
            kps_list (Sequence[T]): A sequence of instances.
            dim (int): The dimension over which the keypoints are
                concatenated. Only 0 is supported. Defaults to 0.

        Returns:
            T: Concatenated instance.

        Raises:
            ValueError: If ``kps_list`` is empty.
        """
        assert isinstance(kps_list, Sequence)
        if len(kps_list) == 0:
            raise ValueError('kps_list should not be a empty list.')

        assert dim == 0
        assert all(isinstance(keypoints, cls) for keypoints in kps_list)

        th_kpt_list = torch.cat(
            [keypoints.keypoints for keypoints in kps_list], dim=dim)
        th_kpt_vis_list = torch.cat(
            [keypoints.keypoints_visible for keypoints in kps_list], dim=dim)
        flip_indices = kps_list[0].flip_indices
        return cls(
            th_kpt_list,
            th_kpt_vis_list,
            clone=False,
            flip_indices=flip_indices)

    def __getitem__(self: T, index: IndexType) -> T:
        """Index along the instance axis while keeping the (N, K, 2) layout.

        An index that selects a single instance (e.g. an ``int``) yields an
        object with ``N == 1`` rather than dropping the instance axis.
        """
        if isinstance(index, np.ndarray):
            index = torch.as_tensor(index, device=self.device)
        if isinstance(index, Tensor) and index.dtype == torch.bool:
            assert index.dim() < self.keypoints.dim() - 1
        elif isinstance(index, tuple):
            assert len(index) < self.keypoints.dim() - 1
            # `Ellipsis`(...) is commonly used in index like [None, ...].
            # When `Ellipsis` is in index, it must be the last item.
            if Ellipsis in index:
                assert index[-1] is Ellipsis

        keypoints = self.keypoints[index]
        keypoints_visible = self.keypoints_visible[index]
        # The check has to look at the *indexed* result: `self.keypoints` is
        # always 3-D, while selecting one instance gives a (K, 2) tensor that
        # the constructor would reject.
        if keypoints.dim() == 2:
            keypoints = keypoints.reshape(1, -1, 2)
            keypoints_visible = keypoints_visible.reshape(1, -1)
        return type(self)(
            keypoints,
            keypoints_visible,
            flip_indices=self.flip_indices,
            clone=False)

    def __repr__(self) -> str:
        """Return a strings that describes the object."""
        return self.__class__.__name__ + '(\n' + str(self.keypoints) + ')'

    @property
    def num_keypoints(self) -> Tensor:
        """Compute the number of visible keypoints for each object."""
        return self.keypoints_visible.sum(dim=1).int()

    def __deepcopy__(self, memo):
        """Only clone the tensors when applying deepcopy."""
        cls = self.__class__
        other = cls.__new__(cls)
        memo[id(self)] = other
        other.keypoints = self.keypoints.clone()
        other.keypoints_visible = self.keypoints_visible.clone()
        other.flip_indices = deepcopy(self.flip_indices)
        return other

    def clone(self: T) -> T:
        """Reload ``clone`` for tensors."""
        return type(self)(
            self.keypoints,
            self.keypoints_visible,
            flip_indices=self.flip_indices,
            clone=True)

    def to(self: T, *args, **kwargs) -> T:
        """Reload ``to`` for tensors."""
        return type(self)(
            self.keypoints.to(*args, **kwargs),
            self.keypoints_visible.to(*args, **kwargs),
            flip_indices=self.flip_indices,
            clone=False)

    @property
    def device(self) -> torch.device:
        """Reload ``device`` from ``self.keypoints``."""
        return self.keypoints.device
import collections
import copy
from abc import ABCMeta, abstractmethod
from typing import Optional, Sequence, Tuple, Union

import mmcv
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.structures.bbox import autocast_box_type
from mmengine.dataset import BaseDataset
from mmengine.dataset.base_dataset import Compose
from numpy import random

from mmyolo.registry import TRANSFORMS


class BaseMixImageTransform(BaseTransform, metaclass=ABCMeta):
    """Common base for augmentations that mix several images into one.

    Subclasses (mosaic, mixup, ...) provide ``get_indexes`` to choose the
    partner images and ``mix_img_transform`` to do the actual mixing; this
    class takes care of fetching, caching and pre-transforming the partners.

    With ``use_cached=True`` the partner images are drawn from an internal
    cache of previously seen results instead of from the dataset.

    Args:
        pre_transform(Sequence[str]): Sequence of transform object or
            config dict to be composed. Defaults to None.
        prob(float): The transformation probability. Defaults to 1.0.
        use_cached (bool): Whether to use cache. Defaults to False.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 40.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        max_refetch (int): The maximum number of retry iterations for getting
            valid results from the pipeline. If the number of iterations is
            greater than `max_refetch`, but results is still None, then the
            iteration is terminated and raise the error. Defaults to 15.
    """

    def __init__(self,
                 pre_transform: Optional[Sequence[str]] = None,
                 prob: float = 1.0,
                 use_cached: bool = False,
                 max_cached_images: int = 40,
                 random_pop: bool = True,
                 max_refetch: int = 15):

        self.max_refetch = max_refetch
        self.prob = prob

        # cache configuration and storage
        self.use_cached = use_cached
        self.max_cached_images = max_cached_images
        self.random_pop = random_pop
        self.results_cache = []

        self.pre_transform = (None if pre_transform is None else
                              Compose(pre_transform))

    @abstractmethod
    def get_indexes(self, dataset: Union[BaseDataset,
                                         list]) -> Union[list, int]:
        """Pick the index/indexes of the images to mix in.

        Args:
            dataset (:obj:`Dataset` or list): The dataset or cached list.

        Returns:
            list or int: indexes.
        """
        pass

    @abstractmethod
    def mix_img_transform(self, results: dict) -> dict:
        """Perform the actual mixing of the images.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.
        """
        pass

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Run the mix augmentation on one sample.

        Steps:
            1. With probability ``1 - prob`` return the input untouched.
            2. Select the other images (from the cache or the dataset) via
               ``get_indexes``.
            3. Run ``pre_transform`` (if any) on each of them.
            4. Delegate the mixing to ``mix_img_transform``.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.

        Raises:
            RuntimeError: If no valid set of partner images was obtained
                within ``max_refetch`` attempts.
        """

        if random.uniform(0, 1) > self.prob:
            return results

        if not self.use_cached:
            assert 'dataset' in results
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)
        else:
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)
            self.results_cache.append(copy.deepcopy(results))
            if len(self.results_cache) > self.max_cached_images:
                if self.random_pop:
                    evict_at = random.randint(0, len(self.results_cache) - 1)
                else:
                    evict_at = 0
                self.results_cache.pop(evict_at)

            # not enough cached samples yet to build a mix
            if len(self.results_cache) <= 4:
                return results

        for _ in range(self.max_refetch):
            # get index of one or three other images
            source = self.results_cache if self.use_cached else dataset
            indexes = self.get_indexes(source)

            if not isinstance(indexes, collections.abc.Sequence):
                indexes = [indexes]

            if self.use_cached:
                mix_results = [
                    copy.deepcopy(self.results_cache[i]) for i in indexes
                ]
            else:
                # get images information will be used for Mosaic or MixUp
                mix_results = [
                    copy.deepcopy(dataset.get_data_info(index))
                    for index in indexes
                ]

            if self.pre_transform is not None:
                for pos, data in enumerate(mix_results):
                    # pre_transform may also require dataset
                    data.update({'dataset': dataset})
                    # before Mosaic or MixUp need to go through
                    # the necessary pre_transform
                    transformed = self.pre_transform(data)
                    transformed.pop('dataset')
                    mix_results[pos] = transformed

            if None not in mix_results:
                results['mix_results'] = mix_results
                break
            print('Repeated calculation')
        else:
            # the loop ran out of attempts without hitting `break`
            raise RuntimeError(
                'The loading pipeline of the original dataset'
                ' always return None. Please check the correctness '
                'of the dataset and its pipeline.')

        # Mosaic or MixUp
        results = self.mix_img_transform(results)

        if 'mix_results' in results:
            results.pop('mix_results')
        results['dataset'] = dataset

        return results
+ + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. 
def mix_img_transform(self, results: dict) -> dict:
    """Compose the current sample and three mixed samples into one mosaic.

    A canvas of twice ``img_scale`` is filled with ``pad_val``. The current
    sample is pasted into the top-left quadrant and ``mix_results[0..2]``
    into the top-right, bottom-left and bottom-right quadrants, around a
    randomly drawn centre. Every sample is resized with its aspect ratio
    kept so that it fits ``img_scale`` before being cropped and pasted.

    Boxes and keypoints of every sample are rescaled and shifted in place;
    masks are resized and translated onto the mosaic canvas.

    Args:
        results (dict): Result dict. Must contain ``mix_results`` with the
            three extra samples, plus ``img``, ``gt_bboxes``,
            ``gt_bboxes_labels`` and ``gt_ignore_flags``. ``gt_masks`` and
            ``gt_keypoints`` are processed when present.

    Returns:
        dict: The same ``results`` object with ``img``, ``img_shape``,
        ``gt_bboxes``, ``gt_bboxes_labels``, ``gt_ignore_flags`` (and
        ``gt_masks`` / ``gt_keypoints`` when present) replaced by their
        mosaic versions.
    """
    assert 'mix_results' in results
    mosaic_bboxes = []
    mosaic_bboxes_labels = []
    mosaic_ignore_flags = []
    mosaic_masks = []
    mosaic_kps = []
    with_mask = 'gt_masks' in results
    with_kps = 'gt_keypoints' in results

    # self.img_scale is (width, height); every array / mask shape below is
    # (height, width), so build the canvas shape once in that order.
    img_scale_w, img_scale_h = self.img_scale
    mosaic_hw = (int(img_scale_h * 2), int(img_scale_w * 2))

    if len(results['img'].shape) == 3:
        mosaic_img = np.full(
            mosaic_hw + (3, ), self.pad_val, dtype=results['img'].dtype)
    else:
        mosaic_img = np.full(
            mosaic_hw, self.pad_val, dtype=results['img'].dtype)

    # mosaic center x, y
    center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w)
    center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h)
    center_position = (center_x, center_y)

    loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
    for i, loc in enumerate(loc_strs):
        if loc == 'top_left':
            results_patch = results
        else:
            results_patch = results['mix_results'][i - 1]

        img_i = results_patch['img']
        h_i, w_i = img_i.shape[:2]
        # keep_ratio resize
        scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i)
        img_i = mmcv.imresize(
            img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))

        # compute the combine parameters
        paste_coord, crop_coord = self._mosaic_combine(
            loc, center_position, img_i.shape[:2][::-1])
        x1_p, y1_p, x2_p, y2_p = paste_coord
        x1_c, y1_c, x2_c, y2_c = crop_coord

        # crop and paste image
        mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]

        # adjust coordinate
        gt_bboxes_i = results_patch['gt_bboxes']
        gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
        gt_ignore_flags_i = results_patch['gt_ignore_flags']

        # Offset of the resized patch's origin inside the mosaic canvas.
        padw = x1_p - x1_c
        padh = y1_p - y1_c
        gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
        gt_bboxes_i.translate_([padw, padh])
        mosaic_bboxes.append(gt_bboxes_i)
        mosaic_bboxes_labels.append(gt_bboxes_labels_i)
        mosaic_ignore_flags.append(gt_ignore_flags_i)
        if with_mask and results_patch.get('gt_masks', None) is not None:
            gt_masks_i = results_patch['gt_masks']
            gt_masks_i = gt_masks_i.resize(img_i.shape[:2])
            # out_shape must be the (height, width) of the mosaic canvas.
            # The previous code passed (width * 2, height * 2), which is
            # only correct when img_scale is square.
            gt_masks_i = gt_masks_i.translate(
                out_shape=mosaic_hw, offset=padw, direction='horizontal')
            gt_masks_i = gt_masks_i.translate(
                out_shape=mosaic_hw, offset=padh, direction='vertical')
            mosaic_masks.append(gt_masks_i)
        if with_kps and results_patch.get('gt_keypoints',
                                          None) is not None:
            gt_kps_i = results_patch['gt_keypoints']
            gt_kps_i.rescale_([scale_ratio_i, scale_ratio_i])
            gt_kps_i.translate_([padw, padh])
            mosaic_kps.append(gt_kps_i)

    mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
    mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
    mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)

    if self.bbox_clip_border:
        mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w])
        if with_mask:
            mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
            results['gt_masks'] = mosaic_masks
        if with_kps:
            mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0)
            mosaic_kps.clip_([2 * img_scale_h, 2 * img_scale_w])
            results['gt_keypoints'] = mosaic_kps
    else:
        # remove outside bboxes (and the matching labels/masks/keypoints)
        inside_inds = mosaic_bboxes.is_inside(
            [2 * img_scale_h, 2 * img_scale_w]).numpy()
        mosaic_bboxes = mosaic_bboxes[inside_inds]
        mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
        mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
        if with_mask:
            mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds]
            results['gt_masks'] = mosaic_masks
        if with_kps:
            mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0)
            mosaic_kps = mosaic_kps[inside_inds]
            results['gt_keypoints'] = mosaic_kps

    results['img'] = mosaic_img
    results['img_shape'] = mosaic_img.shape
    results['gt_bboxes'] = mosaic_bboxes
    results['gt_bboxes_labels'] = mosaic_bboxes_labels
    results['gt_ignore_flags'] = mosaic_ignore_flags

    return results
else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class Mosaic9(BaseMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. 
Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. 
def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
    """Draw the indexes of the eight extra images used by the mosaic.

    Args:
        dataset (:obj:`Dataset` or list): The dataset or cached list to
            sample from; only its length is used.

    Returns:
        list: Eight indexes, each drawn independently with
        ``random.randint(0, len(dataset))``.
    """
    picked = []
    while len(picked) < 8:
        picked.append(random.randint(0, len(dataset)))
    return picked
+ """ + assert 'mix_results' in results + + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 3), int(img_scale_w * 3), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)), + self.pad_val, + dtype=results['img'].dtype) + + # index = 0 is mean original image + # len(results['mix_results']) = 8 + loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + results_all = [results, *results['mix_results']] + for index, results_patch in enumerate(results_all): + img_i = results_patch['img'] + # keep_ratio resize + img_i_h, img_i_w = img_i.shape[:2] + scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w) + img_i = mmcv.imresize( + img_i, + (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i))) + + paste_coord = self._mosaic_combine(loc_strs[index], + img_i.shape[:2]) + + padw, padh = paste_coord[:2] + x1, y1, x2, y2 = (max(x, 0) for x in paste_coord) + mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:] + + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + # Offset + offset_x = int(random.uniform(0, img_scale_w)) + offset_y = int(random.uniform(0, img_scale_h)) + mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h, + offset_x:offset_x + 2 * img_scale_w] + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes.translate_([-offset_x, -offset_y]) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + 
mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine(self, loc: str, + img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]: + """Calculate global coordinate of mosaic image. + + Args: + loc (str): Index for the sub-image. + img_shape_hw (Sequence[int]): Height and width of sub-image + + Returns: + paste_coord (tuple): paste corner coordinate in mosaic image. + """ + assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + img_scale_w, img_scale_h = self.img_scale + + self._current_img_shape = img_shape_hw + current_img_h, current_img_w = self._current_img_shape + previous_img_h, previous_img_w = self._previous_img_shape + center_img_h, center_img_w = self._center_img_shape + + if loc == 'center': + self._center_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + paste_coord = img_scale_w, \ + img_scale_h, \ + img_scale_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'top': + paste_coord = img_scale_w, \ + img_scale_h - current_img_h, \ + img_scale_w + current_img_w, \ + img_scale_h + elif loc == 'top_right': + paste_coord = img_scale_w + previous_img_w, \ + img_scale_h - current_img_h, \ + img_scale_w + previous_img_w + current_img_w, \ + img_scale_h + elif loc == 'right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h, \ + img_scale_w + 
center_img_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'bottom_right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h + previous_img_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + previous_img_h + current_img_h + elif loc == 'bottom': + paste_coord = img_scale_w + center_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'bottom_left': + paste_coord = img_scale_w + center_img_w - \ + previous_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w - previous_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h + elif loc == 'top_left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - \ + previous_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h - previous_img_h + + self._previous_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + return paste_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5MixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOv5. + + .. code:: text + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset. + 2. Randomly obtain the fusion ratio from the beta distribution, + then fuse the target + of the original image and mixup image through this ratio. 
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + alpha (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + beta (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + pre_transform (Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + alpha: float = 32.0, + beta: float = 32.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' 
def mix_img_transform(self, results: dict) -> dict:
    """Blend the current sample with one mixed sample (YOLOv5 MixUp).

    The two images must have the same shape. They are blended with a
    weight drawn from ``Beta(alpha, beta)``, and the annotations of both
    samples are concatenated, current sample first.

    Args:
        results (dict): Result dict containing ``mix_results`` with the
            sample to blend in.

    Returns:
        results (dict): The same dict with ``img``, ``img_shape``,
        ``gt_bboxes``, ``gt_bboxes_labels``, ``gt_ignore_flags`` (and
        ``gt_masks`` when present) replaced by the mixed versions.
    """
    assert 'mix_results' in results

    partner = results['mix_results'][0]
    partner_img = partner['img']
    base_img = results['img']
    assert base_img.shape == partner_img.shape

    # Blend weight for the current image; the partner gets the remainder.
    weight = np.random.beta(self.alpha, self.beta)
    blended = (base_img * weight + partner_img * (1 - weight))

    partner_boxes = partner['gt_bboxes']
    partner_labels = partner['gt_bboxes_labels']
    partner_flags = partner['gt_ignore_flags']

    merged_boxes = partner_boxes.cat((results['gt_bboxes'], partner_boxes),
                                     dim=0)
    merged_labels = np.concatenate(
        (results['gt_bboxes_labels'], partner_labels), axis=0)
    merged_flags = np.concatenate(
        (results['gt_ignore_flags'], partner_flags), axis=0)

    if 'gt_masks' in results:
        assert 'gt_masks' in partner
        own_masks = results['gt_masks']
        results['gt_masks'] = own_masks.cat([own_masks, partner['gt_masks']])

    results['img'] = blended.astype(np.uint8)
    results['img_shape'] = blended.shape
    results['gt_bboxes'] = merged_boxes
    results['gt_bboxes_labels'] = merged_labels
    results['gt_ignore_flags'] = merged_flags

    return results
def mix_img_transform(self, results: dict) -> dict:
    """YOLOX MixUp transform function.

    The mixed-in image is resized (aspect ratio kept) onto a padded canvas
    of ``img_scale``, rescaled by a random jitter factor, optionally
    flipped horizontally, then padded / randomly cropped to the size of
    the current image and averaged with it using equal weights. Boxes
    (and keypoints, when present) of the mixed-in sample go through the
    same geometry and are appended to those of the current sample.

    If the mixed-in sample has no boxes, ``results`` is returned
    unchanged.

    Args:
        results (dict): Result dict. ``mix_results`` must hold exactly one
            sample.

    Returns:
        results (dict): Updated result dict.
    """
    assert 'mix_results' in results
    assert len(
        results['mix_results']) == 1, 'MixUp only support 2 images now !'

    if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
        # empty bbox
        return results

    retrieve_results = results['mix_results'][0]
    retrieve_img = retrieve_results['img']

    jit_factor = random.uniform(*self.ratio_range)
    # NOTE(review): the image is flipped when the draw is *above*
    # flip_ratio, i.e. with probability 1 - flip_ratio. Kept as is; the
    # default of 0.5 is unaffected.
    is_flip = random.uniform(0, 1) > self.flip_ratio

    if len(retrieve_img.shape) == 3:
        out_img = np.ones((self.img_scale[1], self.img_scale[0], 3),
                          dtype=retrieve_img.dtype) * self.pad_val
    else:
        out_img = np.ones(
            self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val

    # 1. keep_ratio resize
    scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0],
                      self.img_scale[0] / retrieve_img.shape[1])
    retrieve_img = mmcv.imresize(
        retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
                       int(retrieve_img.shape[0] * scale_ratio)))

    # 2. paste
    out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img

    # 3. scale jit
    scale_ratio *= jit_factor
    out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
                                      int(out_img.shape[0] * jit_factor)))

    # 4. flip
    if is_flip:
        # Reverse only the width axis so that 2-D (grayscale) canvases
        # work too; for 3-D arrays this equals out_img[:, ::-1, :].
        out_img = out_img[:, ::-1]

    # 5. random crop
    ori_img = results['img']
    origin_h, origin_w = out_img.shape[:2]
    target_h, target_w = ori_img.shape[:2]
    # Give the padded buffer the same trailing (channel) dims as out_img
    # instead of a hard-coded 3, so the 2-D branch above is usable.
    padded_shape = (max(origin_h, target_h),
                    max(origin_w, target_w)) + tuple(out_img.shape[2:])
    padded_img = np.ones(padded_shape) * self.pad_val
    padded_img = padded_img.astype(np.uint8)
    padded_img[:origin_h, :origin_w] = out_img

    x_offset, y_offset = 0, 0
    if padded_img.shape[0] > target_h:
        y_offset = random.randint(0, padded_img.shape[0] - target_h)
    if padded_img.shape[1] > target_w:
        x_offset = random.randint(0, padded_img.shape[1] - target_w)
    padded_cropped_img = padded_img[y_offset:y_offset + target_h,
                                    x_offset:x_offset + target_w]

    # 6. adjust bbox (in place on the mixed-in sample's boxes)
    retrieve_gt_bboxes = retrieve_results['gt_bboxes']
    retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
    if self.bbox_clip_border:
        retrieve_gt_bboxes.clip_([origin_h, origin_w])

    if is_flip:
        retrieve_gt_bboxes.flip_([origin_h, origin_w],
                                 direction='horizontal')

    # 7. filter: move a copy of the boxes into the cropped frame
    cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
    cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
    if self.bbox_clip_border:
        cp_retrieve_gt_bboxes.clip_([target_h, target_w])

    # 8. mix up
    mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img

    retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
    retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']

    mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
        (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
    mixup_gt_bboxes_labels = np.concatenate(
        (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
    mixup_gt_ignore_flags = np.concatenate(
        (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)

    if not self.bbox_clip_border:
        # remove outside bbox
        inside_inds = mixup_gt_bboxes.is_inside([target_h,
                                                 target_w]).numpy()
        mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
        mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
        mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]

    if 'gt_keypoints' in results:
        # adjust kps with the same geometry as the boxes
        retrieve_gt_keypoints = retrieve_results['gt_keypoints']
        retrieve_gt_keypoints.rescale_([scale_ratio, scale_ratio])
        if self.bbox_clip_border:
            retrieve_gt_keypoints.clip_([origin_h, origin_w])

        if is_flip:
            retrieve_gt_keypoints.flip_([origin_h, origin_w],
                                        direction='horizontal')

        # filter
        cp_retrieve_gt_keypoints = retrieve_gt_keypoints.clone()
        cp_retrieve_gt_keypoints.translate_([-x_offset, -y_offset])
        if self.bbox_clip_border:
            cp_retrieve_gt_keypoints.clip_([target_h, target_w])

        # mixup
        mixup_gt_keypoints = cp_retrieve_gt_keypoints.cat(
            (results['gt_keypoints'], cp_retrieve_gt_keypoints), dim=0)
        if not self.bbox_clip_border:
            # keep only keypoints whose box survived the filter above
            mixup_gt_keypoints = mixup_gt_keypoints[inside_inds]
        results['gt_keypoints'] = mixup_gt_keypoints

    results['img'] = mixup_img.astype(np.uint8)
    results['img_shape'] = mixup_img.shape
    results['gt_bboxes'] = mixup_gt_bboxes
    results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
    results['gt_ignore_flags'] = mixup_gt_ignore_flags

    return results
+import math +from copy import deepcopy +from typing import List, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.image.geometric import _scale_size +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_randomness +from mmdet.datasets.transforms import FilterAnnotations as FilterDetAnnotations +from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations +from mmdet.datasets.transforms import RandomAffine as MMDET_RandomAffine +from mmdet.datasets.transforms import RandomFlip as MMDET_RandomFlip +from mmdet.datasets.transforms import Resize as MMDET_Resize +from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, + get_box_type) +from mmdet.structures.mask import PolygonMasks, polygon_to_bitmap +from numpy import random + +from mmyolo.registry import TRANSFORMS +from .keypoint_structure import Keypoints + +# TODO: Waiting for MMCV support +TRANSFORMS.register_module(module=Compose, force=True) + + +@TRANSFORMS.register_module() +class YOLOv5KeepRatioResize(MMDET_Resize): + """Resize images & bbox(if existed). + + This transform resizes the input image according to ``scale``. + Bboxes (if existed) are then resized with the same scale factor. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + - scale (float) + + Added Keys: + + - scale_factor (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + keep_ratio: bool = True, + **kwargs): + assert keep_ratio is True + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + @staticmethod + def _get_rescale_ratio(old_size: Tuple[int, int], + scale: Union[float, Tuple[int]]) -> float: + """Calculate the ratio for rescaling. 
+ + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by + this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within + the scale. + + Returns: + float: The resize ratio. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError('Scale must be a number or tuple of int, ' + f'but got {type(scale)}') + + return scale_factor + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + assert self.keep_ratio is True + + if results.get('img', None) is not None: + image = results['img'] + original_h, original_w = image.shape[:2] + ratio = self._get_rescale_ratio((original_h, original_w), + self.scale) + + if ratio != 1: + # resize image according to the shape + # NOTE: We are currently testing on COCO that modifying + # this code will not affect the results. + # If you find that it has an effect on your results, + # please feel free to contact us. + image = mmcv.imresize( + img=image, + size=(int(original_w * ratio), int(original_h * ratio)), + interpolation='area' if ratio < 1 else 'bilinear', + backend=self.backend) + + resized_h, resized_w = image.shape[:2] + scale_ratio_h = resized_h / original_h + scale_ratio_w = resized_w / original_w + scale_factor = (scale_ratio_w, scale_ratio_h) + + results['img'] = image + results['img_shape'] = image.shape[:2] + results['scale_factor'] = scale_factor + + +@TRANSFORMS.register_module() +class LetterResize(MMDET_Resize): + """Resize and pad image while meeting stride-multiple constraints. 
def __init__(self,
             scale: Union[int, Tuple[int, int]],
             pad_val: dict = dict(img=0, mask=0, seg=255),
             use_mini_pad: bool = False,
             stretch_only: bool = False,
             allow_scale_up: bool = True,
             half_pad_param: bool = False,
             **kwargs):
    """Store the letterbox settings.

    Args:
        scale (Union[int, Tuple[int, int]]): Images scales for resizing.
        pad_val (dict | int | float): Padding values. A plain number is
            expanded to ``dict(img=<number>, seg=255)``.
        use_mini_pad (bool): Whether using minimum rectangle padding.
        stretch_only (bool): Whether stretch to the specified size
            directly.
        allow_scale_up (bool): Allow scale up when ratio > 1.
        half_pad_param (bool): Whether ``pad_param`` is stored as halves
            of the total padding instead of the integer padding list.
        **kwargs: Forwarded to the parent class constructor.
    """
    super().__init__(scale=scale, keep_ratio=True, **kwargs)

    # Normalise first, store second. Storing the raw argument kept a bare
    # number in self.pad_val, and the later self.pad_val.get('img', 0)
    # lookup then failed for scalar pad values.
    if isinstance(pad_val, (int, float)):
        pad_val = dict(img=pad_val, seg=255)
    assert isinstance(
        pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}'
    # Copy so that instances never share (or mutate) the default dict.
    self.pad_val = dict(pad_val)

    self.use_mini_pad = use_mini_pad
    self.stretch_only = stretch_only
    self.allow_scale_up = allow_scale_up
    self.half_pad_param = half_pad_param
scale[1] / image_shape[1]) + + # only scale down, do not scale up (for better test mAP) + if not self.allow_scale_up: + ratio = min(ratio, 1.0) + + ratio = [ratio, ratio] # float -> (float, float) for (height, width) + + # compute the best size of the image + no_pad_shape = (int(round(image_shape[0] * ratio[0])), + int(round(image_shape[1] * ratio[1]))) + + # padding height & width + padding_h, padding_w = [ + scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1] + ] + if self.use_mini_pad: + # minimum rectangle padding + padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32) + + elif self.stretch_only: + # stretch to the specified size directly + padding_h, padding_w = 0.0, 0.0 + no_pad_shape = (scale[0], scale[1]) + ratio = [scale[0] / image_shape[0], + scale[1] / image_shape[1]] # height, width ratios + + if image_shape != no_pad_shape: + # compare with no resize and padding size + image = mmcv.imresize( + image, (no_pad_shape[1], no_pad_shape[0]), + interpolation=self.interpolation, + backend=self.backend) + + scale_factor = (no_pad_shape[1] / image_shape[1], + no_pad_shape[0] / image_shape[0]) + + if 'scale_factor' in results: + results['scale_factor_origin'] = results['scale_factor'] + results['scale_factor'] = scale_factor + + # padding + top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int( + round(padding_w // 2 - 0.1)) + bottom_padding = padding_h - top_padding + right_padding = padding_w - left_padding + + padding_list = [ + top_padding, bottom_padding, left_padding, right_padding + ] + if top_padding != 0 or bottom_padding != 0 or \ + left_padding != 0 or right_padding != 0: + + pad_val = self.pad_val.get('img', 0) + if isinstance(pad_val, int) and image.ndim == 3: + pad_val = tuple(pad_val for _ in range(image.shape[2])) + + image = mmcv.impad( + img=image, + padding=(padding_list[2], padding_list[0], padding_list[3], + padding_list[1]), + pad_val=pad_val, + padding_mode='constant') + + results['img'] = image + 
results['img_shape'] = image.shape + if 'pad_param' in results: + results['pad_param_origin'] = results['pad_param'] * \ + np.repeat(ratio, 2) + + if self.half_pad_param: + results['pad_param'] = np.array( + [padding_h / 2, padding_h / 2, padding_w / 2, padding_w / 2], + dtype=np.float32) + else: + # We found in object detection, using padding list with + # int type can get higher mAP. + results['pad_param'] = np.array(padding_list, dtype=np.float32) + + def _resize_masks(self, results: dict): + """Resize masks with ``results['scale']``""" + if results.get('gt_masks', None) is None: + return + + gt_masks = results['gt_masks'] + assert isinstance( + gt_masks, PolygonMasks + ), f'Only supports PolygonMasks, but got {type(gt_masks)}' + + # resize the gt_masks + gt_mask_h = results['gt_masks'].height * results['scale_factor'][1] + gt_mask_w = results['gt_masks'].width * results['scale_factor'][0] + gt_masks = results['gt_masks'].resize( + (int(round(gt_mask_h)), int(round(gt_mask_w)))) + + top_padding, _, left_padding, _ = results['pad_param'] + if int(left_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(left_padding), + direction='horizontal') + if int(top_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(top_padding), + direction='vertical') + results['gt_masks'] = gt_masks + + def _resize_bboxes(self, results: dict): + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_bboxes', None) is None: + return + results['gt_bboxes'].rescale_(results['scale_factor']) + + if len(results['pad_param']) != 4: + return + results['gt_bboxes'].translate_( + (results['pad_param'][2], results['pad_param'][0])) + + if self.clip_object_border: + results['gt_bboxes'].clip_(results['img_shape']) + + def transform(self, results: dict) -> dict: + results = super().transform(results) + if 'scale_factor_origin' in results: + scale_factor_origin = 
results.pop('scale_factor_origin') + results['scale_factor'] = (results['scale_factor'][0] * + scale_factor_origin[0], + results['scale_factor'][1] * + scale_factor_origin[1]) + if 'pad_param_origin' in results: + pad_param_origin = results.pop('pad_param_origin') + results['pad_param'] += pad_param_origin + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class YOLOv5HSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta ([int, float]): delta of hue. Defaults to 0.015. + saturation_delta ([int, float]): delta of saturation. Defaults to 0.7. + value_delta ([int, float]): delta of value. Defaults to 0.4. + """ + + def __init__(self, + hue_delta: Union[int, float] = 0.015, + saturation_delta: Union[int, float] = 0.7, + value_delta: Union[int, float] = 0.4): + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + def transform(self, results: dict) -> dict: + """The HSV augmentation transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + hsv_gains = \ + random.uniform(-1, 1, 3) * \ + [self.hue_delta, self.saturation_delta, self.value_delta] + 1 + hue, sat, val = cv2.split( + cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV)) + + table_list = np.arange(0, 256, dtype=hsv_gains.dtype) + lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8) + lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8) + lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8) + + im_hsv = cv2.merge( + (cv2.LUT(hue, lut_hue), cv2.LUT(sat, + lut_sat), cv2.LUT(val, lut_val))) + results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMDET_LoadAnnotations): + """Because the yolo series does not need to consider ignore bboxes for the + time being, in order to speed up the pipeline, it can be excluded in + advance. + + Args: + mask2bbox (bool): Whether to use mask annotation to get bbox. + Defaults to False. + poly2mask (bool): Whether to transform the polygons to bitmaps. + Defaults to False. + merge_polygons (bool): Whether to merge polygons into one polygon. + If merged, the storage structure is simpler and training is more + effcient, especially if the mask inside a bbox is divided into + multiple polygons. Defaults to True. + """ + + def __init__(self, + mask2bbox: bool = False, + poly2mask: bool = False, + merge_polygons: bool = True, + **kwargs): + self.mask2bbox = mask2bbox + self.merge_polygons = merge_polygons + assert not poly2mask, 'Does not support BitmapMasks considering ' \ + 'that bitmap consumes more memory.' + super().__init__(poly2mask=poly2mask, **kwargs) + if self.mask2bbox: + assert self.with_mask, 'Using mask2bbox requires ' \ + 'with_mask is True.' 
+ self._mask_ignore_flag = None + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. + """ + if self.mask2bbox: + self._load_masks(results) + if self.with_label: + self._load_labels(results) + self._update_mask_ignore_data(results) + gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox') + results['gt_bboxes'] = gt_bboxes + elif self.with_keypoints: + self._load_kps(results) + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls( + results.get('bbox', []), dtype=torch.float32) + else: + results = super().transform(results) + self._update_mask_ignore_data(results) + return results + + def _update_mask_ignore_data(self, results: dict) -> None: + if 'gt_masks' not in results: + return + + if 'gt_bboxes_labels' in results and len( + results['gt_bboxes_labels']) != len(results['gt_masks']): + assert len(results['gt_bboxes_labels']) == len( + self._mask_ignore_flag) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + self._mask_ignore_flag] + + if 'gt_bboxes' in results and len(results['gt_bboxes']) != len( + results['gt_masks']): + assert len(results['gt_bboxes']) == len(self._mask_ignore_flag) + results['gt_bboxes'] = results['gt_bboxes'][self._mask_ignore_flag] + + def _load_bboxes(self, results: dict): + """Private function to load bounding box annotations. + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes.append(instance['bbox']) + gt_ignore_flags.append(instance['ignore_flag']) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + if self.box_type is None: + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape((-1, 4)) + else: + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32) + + def _load_labels(self, results: dict): + """Private function to load label annotations. + + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + Returns: + dict: The dict contains loaded label annotations. + """ + gt_bboxes_labels = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes_labels.append(instance['bbox_label']) + results['gt_bboxes_labels'] = np.array( + gt_bboxes_labels, dtype=np.int64) + + def _load_masks(self, results: dict) -> None: + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. 
def merge_multi_segment(self,
                        gt_masks: List[np.ndarray]) -> List[np.ndarray]:
    """Merge multi segments to one list.

    Find the coordinates with min distance between each segment,
    then connect these coordinates with one thin line to merge all
    segments into one.

    Args:
        gt_masks(List(np.array)):
            original segmentations in coco's json file.
            like [segmentation1, segmentation2,...],
            each segmentation is a list of coordinates.

    Return:
        gt_masks(List(np.array)): merged gt_masks. When fewer than two
        segments are given there is nothing to merge and the (flattened)
        input segments are returned as they are.
    """
    segments = [np.array(i).reshape(-1, 2) for i in gt_masks]

    # With fewer than two segments no closest-point pair exists, so the
    # per-segment index lists would stay empty and indexing them fails.
    if len(segments) < 2:
        return [segment.reshape(-1, ) for segment in segments]

    s = []
    idx_list = [[] for _ in range(len(gt_masks))]

    # record the indexes with min distance between each segment
    for i in range(1, len(segments)):
        idx1, idx2 = self.min_index(segments[i - 1], segments[i])
        idx_list[i - 1].append(idx1)
        idx_list[i].append(idx2)

    # use two round to connect all the segments
    # first round: first to end, i.e. A->B(partial)->C
    # second round: end to first, i.e. C->B(remaining)-A
    for k in range(2):
        # forward first round
        if k == 0:
            for i, idx in enumerate(idx_list):
                # middle segments have two indexes
                # reverse the index of middle segments
                if len(idx) == 2 and idx[0] > idx[1]:
                    idx = idx[::-1]
                    segments[i] = segments[i][::-1, :]
                # add the idx[0] point for connect next segment
                segments[i] = np.roll(segments[i], -idx[0], axis=0)
                segments[i] = np.concatenate(
                    [segments[i], segments[i][:1]])
                # deal with the first segment and the last one
                if i in [0, len(idx_list) - 1]:
                    s.append(segments[i])
                # deal with the middle segment
                # Note that in the first round, only partial segment
                # are appended.
                else:
                    idx = [0, idx[1] - idx[0]]
                    s.append(segments[i][idx[0]:idx[1] + 1])
        # forward second round
        else:
            for i in range(len(idx_list) - 1, -1, -1):
                # deal with the middle segment
                # append the remaining points
                if i not in [0, len(idx_list) - 1]:
                    idx = idx_list[i]
                    nidx = abs(idx[1] - idx[0])
                    s.append(segments[i][nidx:])
    return [np.concatenate(s).reshape(-1, )]
def _load_kps(self, results: dict) -> None:
    """Private function to load keypoints annotations.

    Reads ``img_shape``, ``bbox``, ``flip_indices`` and (when at least one
    instance exists) ``keypoints``, ``keypoints_visible`` and
    ``category_id`` from ``results``. Writes ``height``, ``width``,
    ``gt_keypoints``, ``gt_ignore_flags`` and ``gt_bboxes_labels`` back
    into ``results``; nothing is returned.

    Args:
        results (dict): Result dict from
            :class:`mmengine.dataset.BaseDataset`.
    """
    results['height'] = results['img_shape'][0]
    results['width'] = results['img_shape'][1]
    num_instances = len(results.get('bbox', []))

    if num_instances == 0:
        # No instances: install empty placeholders so the code below can
        # run unconditionally.
        results['keypoints'] = np.empty(
            (0, len(results['flip_indices']), 2), dtype=np.float32)
        results['keypoints_visible'] = np.empty(
            (0, len(results['flip_indices'])), dtype=np.int32)
        results['category_id'] = []

    results['gt_keypoints'] = Keypoints(
        keypoints=results['keypoints'],
        keypoints_visible=results['keypoints_visible'],
        flip_indices=results['flip_indices'],
    )

    # Pin the dtypes explicitly: ``np.array([])`` would otherwise yield
    # float64 for both arrays whenever there are no instances.
    results['gt_ignore_flags'] = np.zeros(num_instances, dtype=bool)
    # labels are ``category_id - 1``
    results['gt_bboxes_labels'] = np.array(
        results['category_id'], dtype=np.int64) - 1
+ Our implementation is slightly different from the official. In COCO + dataset, a gt may have multiple mask tags. The official YOLOv5 + annotation file already combines the masks that an object has, + but our code takes into account the fact that an object has multiple masks. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (PolygonMasks) (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + min_bbox_size (float): Width and height threshold to filter bboxes. + If the height or width of a box is smaller than this value, it + will be removed. Defaults to 2. + min_area_ratio (float): Threshold of area ratio between + original bboxes and wrapped bboxes. If smaller than this value, + the box will be removed. Defaults to 0.1. + use_mask_refine (bool): Whether to refine bbox by mask. Deprecated. 
@autocast_box_type()
def transform(self, results: dict) -> dict:
    """The YOLOv5 random affine transform function.

    Warps ``results['img']`` with a random homography and applies the same
    warp to the boxes (and, when present, the polygon masks). Boxes that
    fail :meth:`filter_gt_bboxes` are dropped together with their labels,
    ignore flags and masks.

    Args:
        results (dict): The result dict.

    Returns:
        dict: The result dict.
    """
    img = results['img']
    # self.border is wh format
    height = img.shape[0] + self.border[1] * 2
    width = img.shape[1] + self.border[0] * 2

    # Note: Different from YOLOX
    center_matrix = np.eye(3, dtype=np.float32)
    center_matrix[0, 2] = -img.shape[1] / 2
    center_matrix[1, 2] = -img.shape[0] / 2

    warp_matrix, scaling_ratio = self._get_random_homography_matrix(
        height, width)
    warp_matrix = warp_matrix @ center_matrix

    img = cv2.warpPerspective(
        img,
        warp_matrix,
        dsize=(width, height),
        borderValue=self.border_val)
    results['img'] = img
    results['img_shape'] = img.shape
    img_h, img_w = img.shape[:2]

    bboxes = results['gt_bboxes']
    num_bboxes = len(bboxes)
    if num_bboxes:
        orig_bboxes = bboxes.clone()
        orig_bboxes.rescale_([scaling_ratio, scaling_ratio])
        if 'gt_masks' in results:
            # If the dataset has annotations of mask,
            # the mask will be used to refine bbox.
            gt_masks = results['gt_masks']

            gt_masks_resample = self.resample_masks(gt_masks)
            # Pass width / height by keyword: ``warp_mask`` is declared as
            # (gt_masks, warp_matrix, img_w, img_h), and a positional
            # (img_h, img_w) call swaps the size of the returned masks.
            gt_masks = self.warp_mask(
                gt_masks_resample, warp_matrix, img_w=img_w, img_h=img_h)

            # refine bboxes by masks
            bboxes = self.segment2box(gt_masks, height, width)
            # filter bboxes outside image
            valid_index = self.filter_gt_bboxes(orig_bboxes,
                                                bboxes).numpy()
            if self.bbox_clip_border:
                bboxes.clip_([height - 1e-3, width - 1e-3])
                gt_masks = self.clip_polygons(gt_masks, height, width)
            results['gt_masks'] = gt_masks[valid_index]
        else:
            bboxes.project_(warp_matrix)
            if self.bbox_clip_border:
                bboxes.clip_([height, width])

            # filter bboxes
            # Be careful: valid_index must convert to numpy,
            # otherwise it will raise out of bounds when len(valid_index)=1
            valid_index = self.filter_gt_bboxes(orig_bboxes,
                                                bboxes).numpy()

        results['gt_bboxes'] = bboxes[valid_index]
        results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
            valid_index]
        results['gt_ignore_flags'] = results['gt_ignore_flags'][
            valid_index]
    else:
        if 'gt_masks' in results:
            # no boxes: replace the masks with an empty set sized to the
            # warped image
            results['gt_masks'] = PolygonMasks([], img_h, img_w)

    return results
@staticmethod
def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int,
              img_h: int) -> np.ndarray:
    """Project one polygon through ``warp_matrix``.

    The points are lifted to homogeneous coordinates, multiplied by the
    transposed matrix and divided by the resulting third component. No
    point is removed here; ``img_w`` and ``img_h`` are accepted but not
    used by this function.

    Args:
        poly (np.ndarray): Segmentation annotation with shape (n, ) and
            with format (x1, y1, x2, y2, ...).
        warp_matrix (np.ndarray): Transformation matrix. Shape: (3, 3).
        img_w (int): Width of output image.
        img_h (int): Height of output image.

    Returns:
        np.ndarray: The warped polygon, flattened to shape (n, ).
    """
    # TODO: Current logic may cause retained masks unusable for
    # semantic segmentation training, which is same as official
    # implementation.
    points = poly.reshape((-1, 2))
    ones = np.ones((len(points), 1), dtype=points.dtype)
    homogeneous = np.concatenate((points, ones), axis=-1)
    # transform poly
    projected = homogeneous @ warp_matrix.T
    return (projected[:, :2] / projected[:, 2:3]).reshape(-1)
def filter_gt_bboxes(self, origin_bboxes: HorizontalBoxes,
                     wrapped_bboxes: HorizontalBoxes) -> torch.Tensor:
    """Decide which warped boxes are kept.

    A box is kept when all three checks pass: its warped width and height
    both exceed ``self.min_bbox_size``, its warped area divided by its
    original area exceeds ``self.min_area_ratio``, and its aspect ratio
    ``max(w / h, h / w)`` is below ``self.max_aspect_ratio``.

    Args:
        origin_bboxes (HorizontalBoxes): Origin bboxes.
        wrapped_bboxes (HorizontalBoxes): Wrapped bboxes

    Returns:
        torch.Tensor: Element-wise boolean mask, one entry per box.
    """
    eps = 1e-16  # keeps the divisions finite for degenerate boxes
    src_w, src_h = origin_bboxes.widths, origin_bboxes.heights
    dst_w, dst_h = wrapped_bboxes.widths, wrapped_bboxes.heights

    big_enough = (dst_w > self.min_bbox_size) & \
        (dst_h > self.min_bbox_size)
    keeps_area = dst_w * dst_h / (src_w * src_h + eps) > self.min_area_ratio
    elongation = np.maximum(dst_w / (dst_h + eps), dst_h / (dst_w + eps))
    not_too_thin = elongation < self.max_aspect_ratio
    return big_enough & keeps_area & not_too_thin
+ + Args: + rotate_degrees (float): Rotate degrees. + + Returns: + np.ndarray: The rotation matrix. + """ + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def _get_scaling_matrix(scale_ratio: float) -> np.ndarray: + """Get scaling matrix. + + Args: + scale_ratio (float): Scale ratio. + + Returns: + np.ndarray: The scaling matrix. + """ + scaling_matrix = np.array( + [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], + dtype=np.float32) + return scaling_matrix + + @staticmethod + def _get_shear_matrix(x_shear_degrees: float, + y_shear_degrees: float) -> np.ndarray: + """Get shear matrix. + + Args: + x_shear_degrees (float): X shear degrees. + y_shear_degrees (float): Y shear degrees. + + Returns: + np.ndarray: The shear matrix. + """ + x_radian = math.radians(x_shear_degrees) + y_radian = math.radians(y_shear_degrees) + shear_matrix = np.array([[1, np.tan(x_radian), 0.], + [np.tan(y_radian), 1, 0.], [0., 0., 1.]], + dtype=np.float32) + return shear_matrix + + @staticmethod + def _get_translation_matrix(x: float, y: float) -> np.ndarray: + """Get translation matrix. + + Args: + x (float): X translation. + y (float): Y translation. + + Returns: + np.ndarray: The translation matrix. 
+ """ + translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]], + dtype=np.float32) + return translation_matrix + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomDistort(BaseTransform): + """Random hue, saturation, contrast and brightness distortion. + + Required Keys: + + - img + + Modified Keys: + + - img (np.float32) + + Args: + hue_cfg (dict): Hue settings. Defaults to dict(min=-18, + max=18, prob=0.5). + saturation_cfg (dict): Saturation settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + contrast_cfg (dict): Contrast settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + brightness_cfg (dict): Brightness settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + num_distort_func (int): The number of distort function. Defaults + to 4. + """ + + def __init__(self, + hue_cfg: dict = dict(min=-18, max=18, prob=0.5), + saturation_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + contrast_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + brightness_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + num_distort_func: int = 4): + self.hue_cfg = hue_cfg + self.saturation_cfg = saturation_cfg + self.contrast_cfg = contrast_cfg + self.brightness_cfg = brightness_cfg + self.num_distort_func = num_distort_func + assert 0 < self.num_distort_func <= 4, \ + 'num_distort_func must > 0 and <= 4' + for cfg in [ + self.hue_cfg, self.saturation_cfg, self.contrast_cfg, + self.brightness_cfg + ]: + assert 0. 
def transform_saturation(self, results):
    """Transform saturation randomly.

    With probability ``saturation_cfg['prob']`` a factor ``delta`` is
    drawn from ``[saturation_cfg['min'], saturation_cfg['max']]`` and the
    image becomes ``img * delta + gray * (1 - delta)``, where ``gray`` is
    the channel sum weighted by (0.114, 0.587, 0.299). ``results['img']``
    is modified in place.
    """
    cfg = self.saturation_cfg
    if random.uniform(0., 1.) >= cfg['prob']:
        return results

    img = results['img']
    delta = random.uniform(cfg['min'], cfg['max'])

    # convert bgr img to gray img
    channel_weights = np.array([[[0.114, 0.587, 0.299]]], dtype=np.float32)
    gray = (img * channel_weights).sum(axis=2, keepdims=True)
    gray *= (1.0 - delta)

    # in-place on purpose: the caller's array is the one being updated
    np.multiply(img, delta, out=img)
    np.add(img, gray, out=img)
    results['img'] = img
    return results
+ + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + results['img'] = results['img'].astype(np.float32) + + functions = [ + self.transform_brightness, self.transform_contrast, + self.transform_saturation, self.transform_hue + ] + distortions = random.permutation(functions)[:self.num_distort_func] + for func in distortions: + results = func(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_cfg={self.hue_cfg}, ' + repr_str += f'saturation_cfg={self.saturation_cfg}, ' + repr_str += f'contrast_cfg={self.contrast_cfg}, ' + repr_str += f'brightness_cfg={self.brightness_cfg}, ' + repr_str += f'num_distort_func={self.num_distort_func})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomCrop(BaseTransform): + """Random crop the img and bboxes. Different thresholds are used in PPYOLOE + to judge whether the clipped image meets the requirements. This + implementation is different from the implementation of RandomCrop in mmdet. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + aspect_ratio (List[float]): Aspect ratio of cropped region. Default to + [.5, 2]. + thresholds (List[float]): Iou thresholds for deciding a valid bbox crop + in [min, max] format. Defaults to [.0, .1, .3, .5, .7, .9]. + scaling (List[float]): Ratio between a cropped region and the original + image in [min, max] format. Default to [.3, 1.]. + num_attempts (int): Number of tries for each threshold before + giving up. Default to 50. + allow_no_crop (bool): Allow return without actually cropping them. + Default to True. + cover_all_box (bool): Ensure all bboxes are covered in the final crop. 
+ Default to False. + """ + + def __init__(self, + aspect_ratio: List[float] = [.5, 2.], + thresholds: List[float] = [.0, .1, .3, .5, .7, .9], + scaling: List[float] = [.3, 1.], + num_attempts: int = 50, + allow_no_crop: bool = True, + cover_all_box: bool = False): + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + + def _crop_data(self, results: dict, crop_box: Tuple[int, int, int, int], + valid_inds: np.ndarray) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_box (Tuple[int, int, int, int]): Expected absolute coordinates + for cropping, (x1, y1, x2, y2). + valid_inds (np.ndarray): The indexes of gt that needs to be + retained. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + # crop the image + img = results['img'] + crop_x1, crop_y1, crop_x2, crop_y2 = crop_box + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] 
+ results['img'] = img + img_shape = img.shape + results['img_shape'] = img.shape + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-crop_x1, -crop_y1]) + bboxes.clip_(img_shape[:2]) + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The random crop transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + if results.get('gt_bboxes', None) is None or len( + results['gt_bboxes']) == 0: + return results + + orig_img_h, orig_img_w = results['img'].shape[:2] + gt_bboxes = results['gt_bboxes'] + + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + random.shuffle(thresholds) + + for thresh in thresholds: + # Determine the coordinates for cropping + if thresh == 'no_crop': + return results + + found = False + for i in range(self.num_attempts): + crop_h, crop_w = self._get_crop_size((orig_img_h, orig_img_w)) + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + # get image crop_box + margin_h = max(orig_img_h - crop_h, 0) + margin_w = max(orig_img_w - crop_w, 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_h + crop_x1, crop_x2 = offset_w, offset_w + crop_w + + crop_box = [crop_x1, crop_y1, crop_x2, crop_y2] + # Calculate the iou between gt_bboxes and crop_boxes + iou = self._iou_matrix(gt_bboxes, + np.array([crop_box], dtype=np.float32)) + # If the maximum value of the iou is less than thresh, + # the current crop_box is considered invalid. + if iou.max() < thresh: + continue + + # If cover_all_box == True and the minimum value of + # the iou is less than thresh, the current crop_box + # is considered invalid. + if self.cover_all_box and iou.min() < thresh: + continue + + # Get which gt_bboxes to keep after cropping. + valid_inds = self._get_valid_inds( + gt_bboxes, np.array(crop_box, dtype=np.float32)) + if valid_inds.size > 0: + found = True + break + + if found: + results = self._crop_data(results, crop_box, valid_inds) + return results + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. 
+ + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return (offset_h, offset_w) + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the crop size based on `image_size`. + + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + scale = random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = random.uniform(*self.scaling) + w_scale = random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + return int(crop_h), int(crop_w) + + def _iou_matrix(self, + gt_bbox: HorizontalBoxes, + crop_bbox: np.ndarray, + eps: float = 1e-10) -> np.ndarray: + """Calculate iou between gt and image crop box. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + eps (float): Default to 1e-10. + Return: + (np.ndarray): IoU. 
+ """ + gt_bbox = gt_bbox.tensor.numpy() + lefttop = np.maximum(gt_bbox[:, np.newaxis, :2], crop_bbox[:, :2]) + rightbottom = np.minimum(gt_bbox[:, np.newaxis, 2:], crop_bbox[:, 2:]) + + overlap = np.prod( + rightbottom - lefttop, + axis=2) * (lefttop < rightbottom).all(axis=2) + area_gt_bbox = np.prod(gt_bbox[:, 2:] - gt_bbox[:, :2], axis=1) + area_crop_bbox = np.prod(crop_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap) + return overlap / (area_o + eps) + + def _get_valid_inds(self, gt_bbox: HorizontalBoxes, + img_crop_bbox: np.ndarray) -> np.ndarray: + """Get which Bboxes to keep at the current cropping coordinates. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + img_crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + + Returns: + (np.ndarray): Valid indexes. + """ + cropped_box = gt_bbox.tensor.numpy().copy() + gt_bbox = gt_bbox.tensor.numpy().copy() + + cropped_box[:, :2] = np.maximum(gt_bbox[:, :2], img_crop_bbox[:2]) + cropped_box[:, 2:] = np.minimum(gt_bbox[:, 2:], img_crop_bbox[2:]) + cropped_box[:, :2] -= img_crop_bbox[:2] + cropped_box[:, 2:] -= img_crop_bbox[:2] + + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + valid = np.logical_and(img_crop_bbox[:2] <= centers, + centers < img_crop_bbox[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return np.where(valid)[0] + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aspect_ratio={self.aspect_ratio}, ' + repr_str += f'thresholds={self.thresholds}, ' + repr_str += f'scaling={self.scaling}, ' + repr_str += f'num_attempts={self.num_attempts}, ' + repr_str += f'allow_no_crop={self.allow_no_crop}, ' + repr_str += f'cover_all_box={self.cover_all_box})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5CopyPaste(BaseTransform): + """Copy-Paste used in YOLOv5 and YOLOv8. 
+ + This transform randomly copy some objects in the image to the mirror + position of the image.It is different from the `CopyPaste` in mmdet. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + ioa_thresh (float): Ioa thresholds for deciding valid bbox. + prob (float): Probability of choosing objects. + Defaults to 0.5. + """ + + def __init__(self, ioa_thresh: float = 0.3, prob: float = 0.5): + self.ioa_thresh = ioa_thresh + self.prob = prob + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The YOLOv5 and YOLOv8 Copy-Paste transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if len(results.get('gt_masks', [])) == 0: + return results + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks), \ + 'only support type of PolygonMasks,' \ + ' but get type: %s' % type(gt_masks) + gt_bboxes = results['gt_bboxes'] + gt_bboxes_labels = results.get('gt_bboxes_labels', None) + img = results['img'] + img_h, img_w = img.shape[:2] + + # calculate ioa + gt_bboxes_flip = deepcopy(gt_bboxes) + gt_bboxes_flip.flip_(img.shape) + + ioa = self.bbox_ioa(gt_bboxes_flip, gt_bboxes) + indexes = torch.nonzero((ioa < self.ioa_thresh).all(1))[:, 0] + n = len(indexes) + valid_inds = random.choice( + indexes, size=round(self.prob * n), replace=False) + if len(valid_inds) == 0: + return results + + if gt_bboxes_labels is not None: + # prepare labels + gt_bboxes_labels = np.concatenate( + (gt_bboxes_labels, gt_bboxes_labels[valid_inds]), axis=0) + + # prepare bboxes + copypaste_bboxes = gt_bboxes_flip[valid_inds] + gt_bboxes = gt_bboxes.cat([gt_bboxes, copypaste_bboxes]) + + # prepare 
images + copypaste_gt_masks = gt_masks[valid_inds] + copypaste_gt_masks_flip = copypaste_gt_masks.flip() + # convert poly format to bitmap format + # example: poly: [[array(0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0, 10.0]] + # -> bitmap: a mask with shape equal to (1, img_h, img_w) + # # type1 low speed + # copypaste_gt_masks_bitmap = copypaste_gt_masks.to_ndarray() + # copypaste_mask = np.sum(copypaste_gt_masks_bitmap, axis=0) > 0 + + # type2 + copypaste_mask = np.zeros((img_h, img_w), dtype=np.uint8) + for poly in copypaste_gt_masks.masks: + poly = [i.reshape((-1, 1, 2)).astype(np.int32) for i in poly] + cv2.drawContours(copypaste_mask, poly, -1, (1, ), cv2.FILLED) + + copypaste_mask = copypaste_mask.astype(bool) + + # copy objects, and paste to the mirror position of the image + copypaste_mask_flip = mmcv.imflip( + copypaste_mask, direction='horizontal') + copypaste_img = mmcv.imflip(img, direction='horizontal') + img[copypaste_mask_flip] = copypaste_img[copypaste_mask_flip] + + # prepare masks + gt_masks = copypaste_gt_masks.cat([gt_masks, copypaste_gt_masks_flip]) + + if 'gt_ignore_flags' in results: + # prepare gt_ignore_flags + gt_ignore_flags = results['gt_ignore_flags'] + gt_ignore_flags = np.concatenate( + [gt_ignore_flags, gt_ignore_flags[valid_inds]], axis=0) + results['gt_ignore_flags'] = gt_ignore_flags + + results['img'] = img + results['gt_bboxes'] = gt_bboxes + if gt_bboxes_labels is not None: + results['gt_bboxes_labels'] = gt_bboxes_labels + results['gt_masks'] = gt_masks + + return results + + @staticmethod + def bbox_ioa(gt_bboxes_flip: HorizontalBoxes, + gt_bboxes: HorizontalBoxes, + eps: float = 1e-7) -> np.ndarray: + """Calculate ioa between gt_bboxes_flip and gt_bboxes. + + Args: + gt_bboxes_flip (HorizontalBoxes): Flipped ground truth + bounding boxes. + gt_bboxes (HorizontalBoxes): Ground truth bounding boxes. + eps (float): Default to 1e-10. + Return: + (Tensor): Ioa. 
+ """ + gt_bboxes_flip = gt_bboxes_flip.tensor + gt_bboxes = gt_bboxes.tensor + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = gt_bboxes_flip.T + b2_x1, b2_y1, b2_x2, b2_y2 = gt_bboxes.T + + # Intersection area + inter_area = (torch.minimum(b1_x2[:, None], + b2_x2) - torch.maximum(b1_x1[:, None], + b2_x1)).clip(0) * \ + (torch.minimum(b1_y2[:, None], + b2_y2) - torch.maximum(b1_y1[:, None], + b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(ioa_thresh={self.ioa_thresh},' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class RemoveDataElement(BaseTransform): + """Remove unnecessary data element in results. + + Args: + keys (Union[str, Sequence[str]]): Keys need to be removed. + """ + + def __init__(self, keys: Union[str, Sequence[str]]): + self.keys = [keys] if isinstance(keys, str) else keys + + def transform(self, results: dict) -> dict: + for key in self.keys: + results.pop(key, None) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(keys={self.keys})' + return repr_str + + +@TRANSFORMS.register_module() +class RegularizeRotatedBox(BaseTransform): + """Regularize rotated boxes. + + Due to the angle periodicity, one rotated box can be represented in + many different (x, y, w, h, t). To make each rotated box unique, + ``regularize_boxes`` will take the remainder of the angle divided by + 180 degrees. + + For convenience, three angle_version can be used here: + + - 'oc': OpenCV Definition. Has the same box representation as + ``cv2.minAreaRect`` the angle ranges in [-90, 0). + - 'le90': Long Edge Definition (90). the angle ranges in [-90, 90). + The width is always longer than the height. + - 'le135': Long Edge Definition (135). the angle ranges in [-45, 135). 
+ The width is always longer than the height. + + Required Keys: + + - gt_bboxes (RotatedBoxes[torch.float32]) + + Modified Keys: + + - gt_bboxes + + Args: + angle_version (str): Angle version. Can only be 'oc', + 'le90', or 'le135'. Defaults to 'le90. + """ + + def __init__(self, angle_version='le90') -> None: + self.angle_version = angle_version + try: + from mmrotate.structures.bbox import RotatedBoxes + self.box_type = RotatedBoxes + except ImportError: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + def transform(self, results: dict) -> dict: + assert isinstance(results['gt_bboxes'], self.box_type) + results['gt_bboxes'] = self.box_type( + results['gt_bboxes'].regularize_boxes(self.angle_version)) + return results + + +@TRANSFORMS.register_module() +class Polygon2Mask(BaseTransform): + """Polygons to bitmaps in YOLOv5. + + Args: + downsample_ratio (int): Downsample ratio of mask. + mask_overlap (bool): Whether to use maskoverlap in mask process. + When set to True, the implementation here is the same as the + official, with higher training speed. If set to True, all gt masks + will compress into one overlap mask, the value of mask indicates + the index of gt masks. If set to False, one mask is a binary mask. + Default to True. + coco_style (bool): Whether to use coco_style to convert the polygons to + bitmaps. Note that this option is only used to test if there is an + improvement in training speed and we recommend setting it to False. + """ + + def __init__(self, + downsample_ratio: int = 4, + mask_overlap: bool = True, + coco_style: bool = False): + self.downsample_ratio = downsample_ratio + self.mask_overlap = mask_overlap + self.coco_style = coco_style + + def polygon2mask(self, + img_shape: Tuple[int, int], + polygons: np.ndarray, + color: int = 1) -> np.ndarray: + """ + Args: + img_shape (tuple): The image size. 
+ polygons (np.ndarray): [N, M], N is the number of polygons, + M is the number of points(Be divided by 2). + color (int): color in fillPoly. + Return: + np.ndarray: the overlap mask. + """ + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + if self.coco_style: + # This practice can lead to the loss of small objects + # polygons = polygons.resize((nh, nw)).masks + # polygons = np.asarray(polygons).reshape(-1) + # mask = polygon_to_bitmap([polygons], nh, nw) + + polygons = np.asarray(polygons).reshape(-1) + mask = polygon_to_bitmap([polygons], img_shape[0], + img_shape[1]).astype(np.uint8) + mask = mmcv.imresize(mask, (nw, nh)) + else: + mask = np.zeros(img_shape, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + # NOTE: fillPoly firstly then resize is trying the keep the same + # way of loss calculation when mask-ratio=1. + mask = mmcv.imresize(mask, (nw, nh)) + return mask + + def polygons2masks(self, + img_shape: Tuple[int, int], + polygons: PolygonMasks, + color: int = 1) -> np.ndarray: + """Return a list of bitmap masks. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + List[np.ndarray]: the list of masks in bitmaps. + """ + if self.coco_style: + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + masks = polygons.resize((nh, nw)).to_ndarray() + return masks + else: + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color) + masks.append(mask) + return np.array(masks) + + def polygons2masks_overlap( + self, img_shape: Tuple[int, int], + polygons: PolygonMasks) -> Tuple[np.ndarray, np.ndarray]: + """Return a overlap mask and the sorted idx of area. 
+ + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + Tuple[np.ndarray, np.ndarray]: + the overlap mask and the sorted idx of area. + """ + masks = np.zeros((img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio), + dtype=np.int32 if len(polygons) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color=1) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(polygons)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + def transform(self, results: dict) -> dict: + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks) + + if self.mask_overlap: + masks, sorted_idx = self.polygons2masks_overlap( + (gt_masks.height, gt_masks.width), gt_masks) + results['gt_bboxes'] = results['gt_bboxes'][sorted_idx] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + sorted_idx] + + # In this case we put gt_masks in gt_panoptic_seg + results.pop('gt_masks') + results['gt_panoptic_seg'] = torch.from_numpy(masks[None]) + else: + masks = self.polygons2masks((gt_masks.height, gt_masks.width), + gt_masks, + color=1) + masks = torch.from_numpy(masks) + # Consistent logic with mmdet + results['gt_masks'] = masks + return results + + +@TRANSFORMS.register_module() +class FilterAnnotations(FilterDetAnnotations): + """Filter invalid annotations. + + In addition to the conditions checked by ``FilterDetAnnotations``, this + filter adds a new condition requiring instances to have at least one + visible keypoints. 
+ """ + + def __init__(self, by_keypoints: bool = False, **kwargs) -> None: + # TODO: add more filter options + super().__init__(**kwargs) + self.by_keypoints = by_keypoints + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + Returns: + dict: Updated result dict. + """ + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + if gt_bboxes.shape[0] == 0: + return results + + tests = [] + if self.by_box: + tests.append( + ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) & + (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy()) + + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + if self.by_keypoints: + assert 'gt_keypoints' in results + num_keypoints = results['gt_keypoints'].num_keypoints + tests.append((num_keypoints > 0).numpy()) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags', + 'gt_keypoints') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomAffine(MMDET_RandomAffine): + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + warp_matrix = self._get_random_homography_matrix(height, width) + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + 
bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + # remove outside bbox + valid_index = bboxes.is_inside([height, width]).numpy() + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + if 'gt_masks' in results: + raise NotImplementedError('RandomAffine only supports bbox.') + + if 'gt_keypoints' in results: + keypoints = results['gt_keypoints'] + keypoints.project_(warp_matrix) + if self.bbox_clip_border: + keypoints.clip_([height, width]) + results['gt_keypoints'] = keypoints[valid_index] + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomFlip(MMDET_RandomFlip): + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + + # flip keypoints + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].flip_(img_shape, results['flip_direction']) + + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + 
self._record_homography_matrix(results) + + +@TRANSFORMS.register_module() +class Resize(MMDET_Resize): + + def _resize_keypoints(self, results: dict) -> None: + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].rescale_(results['scale_factor']) + if self.clip_object_border: + results['gt_keypoints'].clip_(results['img_shape']) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to resize images, bounding boxes and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys + are updated in result dict. + """ + if self.scale: + results['scale'] = self.scale + else: + img_shape = results['img'].shape[:2] + results['scale'] = _scale_size(img_shape[::-1], self.scale_factor) + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + self._record_homography_matrix(results) + return results diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/utils.py b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..efa2ff5ef07d73e82c258474db7b0e49edc4825a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/utils.py @@ -0,0 +1,133 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import numpy as np +import torch +from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.dist import get_dist_info + +from ..registry import TASK_UTILS + + +@COLLATE_FUNCTIONS.register_module() +def yolov5_collate(data_batch: Sequence, + use_ms_training: bool = False) -> dict: + """Rewrite collate_fn to get faster training speed. 
+ + Args: + data_batch (Sequence): Batch of data. + use_ms_training (bool): Whether to use multi-scale training. + """ + batch_imgs = [] + batch_bboxes_labels = [] + batch_masks = [] + batch_keyponits = [] + batch_keypoints_visible = [] + for i in range(len(data_batch)): + datasamples = data_batch[i]['data_samples'] + inputs = data_batch[i]['inputs'] + batch_imgs.append(inputs) + + gt_bboxes = datasamples.gt_instances.bboxes.tensor + gt_labels = datasamples.gt_instances.labels + if 'masks' in datasamples.gt_instances: + masks = datasamples.gt_instances.masks + batch_masks.append(masks) + if 'gt_panoptic_seg' in datasamples: + batch_masks.append(datasamples.gt_panoptic_seg.pan_seg) + if 'keypoints' in datasamples.gt_instances: + keypoints = datasamples.gt_instances.keypoints + keypoints_visible = datasamples.gt_instances.keypoints_visible + batch_keyponits.append(keypoints) + batch_keypoints_visible.append(keypoints_visible) + + batch_idx = gt_labels.new_full((len(gt_labels), 1), i) + bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), + dim=1) + batch_bboxes_labels.append(bboxes_labels) + collated_results = { + 'data_samples': { + 'bboxes_labels': torch.cat(batch_bboxes_labels, 0) + } + } + if len(batch_masks) > 0: + collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0) + + if len(batch_keyponits) > 0: + collated_results['data_samples']['keypoints'] = torch.cat( + batch_keyponits, 0) + collated_results['data_samples']['keypoints_visible'] = torch.cat( + batch_keypoints_visible, 0) + + if use_ms_training: + collated_results['inputs'] = batch_imgs + else: + collated_results['inputs'] = torch.stack(batch_imgs, 0) + return collated_results + + +@TASK_UTILS.register_module() +class BatchShapePolicy: + """BatchShapePolicy is only used in the testing phase, which can reduce the + number of pad pixels during batch inference. + + Args: + batch_size (int): Single GPU batch size during batch inference. + Defaults to 32. 
+ img_size (int): Expected output image size. Defaults to 640. + size_divisor (int): The minimum size that is divisible + by size_divisor. Defaults to 32. + extra_pad_ratio (float): Extra pad ratio. Defaults to 0.5. + """ + + def __init__(self, + batch_size: int = 32, + img_size: int = 640, + size_divisor: int = 32, + extra_pad_ratio: float = 0.5): + self.img_size = img_size + self.size_divisor = size_divisor + self.extra_pad_ratio = extra_pad_ratio + _, world_size = get_dist_info() + # During multi-gpu testing, the batchsize should be multiplied by + # worldsize, so that the number of batches can be calculated correctly. + # The index of batches will affect the calculation of batch shape. + self.batch_size = batch_size * world_size + + def __call__(self, data_list: List[dict]) -> List[dict]: + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.extra_pad_ratio).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] + + return data_list diff --git 
a/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55bc899abfcceebfdadf7549e56336725d891dcb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Optional + +from mmdet.datasets import BaseDetDataset, CocoDataset + +from ..registry import DATASETS, TASK_UTILS + + +class BatchShapePolicyDataset(BaseDetDataset): + """Dataset with the batch shape policy that makes paddings with least + pixels during batch inference process, which does not require the image + scales of all batches to be the same throughout validation.""" + + def __init__(self, + *args, + batch_shapes_cfg: Optional[dict] = None, + **kwargs): + self.batch_shapes_cfg = batch_shapes_cfg + super().__init__(*args, **kwargs) + + def full_init(self): + """rewrite full_init() to be compatible with serialize_data in + BatchShapePolicy.""" + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + + # batch_shapes_cfg + if self.batch_shapes_cfg: + batch_shapes_policy = TASK_UTILS.build(self.batch_shapes_cfg) + self.data_list = batch_shapes_policy(self.data_list) + del batch_shapes_policy + + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + # Get subset data according to indices. 
+ if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def prepare_data(self, idx: int) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) + + +@DATASETS.register_module() +class YOLOv5CocoDataset(BatchShapePolicyDataset, CocoDataset): + """Dataset for YOLOv5 COCO Dataset. + + We only add `BatchShapePolicy` function compared with CocoDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..486a8324fb4c7d8a34bf885f1818d2e6f974f6e7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import CrowdHumanDataset + +from ..registry import DATASETS +from .yolov5_coco import BatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5CrowdHumanDataset(BatchShapePolicyDataset, CrowdHumanDataset): + """Dataset for YOLOv5 CrowdHuman Dataset. + + We only add `BatchShapePolicy` function compared with CrowdHumanDataset. 
+ See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..a9647981333ed725a568a293279873ab9e20db47 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + +try: + from mmrotate.datasets import DOTADataset + MMROTATE_AVAILABLE = True +except ImportError: + from mmengine.dataset import BaseDataset + DOTADataset = BaseDataset + MMROTATE_AVAILABLE = False + + +@DATASETS.register_module() +class YOLOv5DOTADataset(BatchShapePolicyDataset, DOTADataset): + """Dataset for YOLOv5 DOTA Dataset. + + We only add `BatchShapePolicy` function compared with DOTADataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + + def __init__(self, *args, **kwargs): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(*args, **kwargs) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5be764f1db3097645ae1be387e45cafb1b460731 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import VOCDataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5VOCDataset(BatchShapePolicyDataset, VOCDataset): + """Dataset for YOLOv5 VOC Dataset. 
+ + We only add `BatchShapePolicy` function compared with VOCDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4904a9058b41526d9719994ed718ae58336d290e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdeploy.codebase.base import MMCodebase + +from .models import * # noqa: F401,F403 +from .object_detection import MMYOLO, YOLOObjectDetection + +__all__ = ['MMCodebase', 'MMYOLO', 'YOLOObjectDetection'] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b999a0161543d6a9d2ab56d797af740dc7261e4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import dense_heads # noqa: F401,F403 diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc423af3ec374cabe2b9f46d2fe4f4dc9755b8e3 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . 
import yolov5_head # noqa: F401,F403 + +__all__ = ['yolov5_head'] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ac996ba41336243ef091e3e952430382be9ff978 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from functools import partial +from typing import List, Optional, Tuple + +import torch +from mmdeploy.codebase.mmdet import get_post_processing_params +from mmdeploy.codebase.mmdet.models.layers import multiclass_nms +from mmdeploy.core import FUNCTION_REWRITER +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.deploy.models.layers import efficient_nms +from mmyolo.models.dense_heads import YOLOv5Head + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: int) -> Tensor: + """Decode YOLOv5 bounding boxes. + + Args: + priors (Tensor): Prior boxes in center-offset form. + bbox_preds (Tensor): Predicted bounding boxes. + stride (int): Stride of the feature map. + + Returns: + Tensor: Decoded bounding boxes. 
+ """ + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict_by_feat') +def yolov5_head__predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> Tuple[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. 
+ Returns: + tuple[Tensor, Tensor]: The first item is an (N, num_box, 5) tensor, + where 5 represent (tl_x, tl_y, br_x, br_y, score), N is batch + size and the score between 0 and 1. The shape of the second + tensor in the tuple is (N, num_box), and each element + represents the class label of the corresponding box. + """ + ctx = FUNCTION_REWRITER.get_context() + detector_type = type(self) + deploy_cfg = ctx.cfg + use_efficientnms = deploy_cfg.get('use_efficientnms', False) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + bbox_decoder = self.bbox_coder.decode + nms_func = multiclass_nms + if use_efficientnms: + if detector_type is YOLOv5Head: + nms_func = partial(efficient_nms, box_coding=0) + bbox_decoder = yolov5_bbox_decoder + else: + nms_func = efficient_nms + + assert len(cls_scores) == len(bbox_preds) + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) + for featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = 
torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + if not with_nms: + return bboxes, scores + + post_params = get_post_processing_params(deploy_cfg) + max_output_boxes_per_class = post_params.max_output_boxes_per_class + iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold) + score_threshold = cfg.get('score_thr', post_params.score_threshold) + pre_top_k = post_params.pre_top_k + keep_top_k = cfg.get('max_per_img', post_params.keep_top_k) + + return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold, pre_top_k, keep_top_k) + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict', + backend='rknn') +def yolov5_head__predict__rknn(self, x: Tuple[Tensor], *args, + **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Perform forward propagation of the detection head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + """ + outs = self(x) + return outs + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' 
+ 'YOLOv5HeadModule.forward', + backend='rknn') +def yolov5_head_module__forward__rknn( + self, x: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + out = [] + for i, feat in enumerate(x): + out.append(self.convs_pred[i](feat)) + return out diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6017cf83425b640eb788a8abf6b253f29d759afb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_nms import efficient_nms + +__all__ = ['efficient_nms'] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..4db81c0227a36e0315855082dcd8125e1f9be70a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdeploy.core import mark +from torch import Tensor + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. 
+ Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x, y, w, h]. + Set to 1 means [x1, y1 ,x2, y2]. + + Returns: + tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5] + and `labels` of shape [N, num_det]. + """ + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1) + + # retain shape info + batch_size = boxes.size(0) + + dets_shape = dets.shape + label_shape = labels.shape + dets = dets.reshape([batch_size, *dets_shape[1:]]) + labels = labels.reshape([batch_size, *label_shape[1:]]) + return dets, labels + + +@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels']) +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +class TRTEfficientNMSop(torch.autograd.Function): + """Efficient NMS op for TensorRT.""" + + @staticmethod + def forward( + ctx, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25, + ): + """Forward function of TRTEfficientNMSop.""" + batch_size, num_boxes, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes, + scores, + background_class=-1, + 
box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25): + """Symbolic function of TRTEfficientNMSop.""" + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + nums, boxes, scores, classes = out + return nums, boxes, scores, classes diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/object_detection.py b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..7efdfcfb7a46c8bc6b90e76bd06d9065410e55f0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/deploy/object_detection.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Callable, Dict, Optional + +import torch +from mmdeploy.codebase.base import CODEBASE, MMCodebase +from mmdeploy.codebase.mmdet.deploy import ObjectDetection +from mmdeploy.utils import Codebase, Task +from mmengine import Config +from mmengine.registry import Registry + +MMYOLO_TASK = Registry('mmyolo_tasks') + + +@CODEBASE.register_module(Codebase.MMYOLO.value) +class MMYOLO(MMCodebase): + """MMYOLO codebase class.""" + + task_registry = MMYOLO_TASK + + @classmethod + def register_deploy_modules(cls): + """register all rewriters for mmdet.""" + import mmdeploy.codebase.mmdet.models # noqa: F401 + import mmdeploy.codebase.mmdet.ops # noqa: F401 + import mmdeploy.codebase.mmdet.structures # noqa: F401 + + @classmethod + def register_all_modules(cls): + """register all modules.""" + from mmdet.utils.setup_env import \ + register_all_modules as register_all_modules_mmdet + + from mmyolo.utils.setup_env import \ + register_all_modules as register_all_modules_mmyolo + + 
cls.register_deploy_modules() + register_all_modules_mmyolo(True) + register_all_modules_mmdet(False) + + +def _get_dataset_metainfo(model_cfg: Config): + """Get metainfo of dataset. + + Args: + model_cfg Config: Input model Config object. + + Returns: + list[str]: A list of string specifying names of different class. + """ + from mmyolo import datasets # noqa + from mmyolo.registry import DATASETS + + module_dict = DATASETS.module_dict + for dataloader_name in [ + 'test_dataloader', 'val_dataloader', 'train_dataloader' + ]: + if dataloader_name not in model_cfg: + continue + dataloader_cfg = model_cfg[dataloader_name] + dataset_cfg = dataloader_cfg.dataset + dataset_cls = module_dict.get(dataset_cfg.type, None) + if dataset_cls is None: + continue + if hasattr(dataset_cls, '_load_metainfo') and isinstance( + dataset_cls._load_metainfo, Callable): + meta = dataset_cls._load_metainfo( + dataset_cfg.get('metainfo', None)) + if meta is not None: + return meta + if hasattr(dataset_cls, 'METAINFO'): + return dataset_cls.METAINFO + + return None + + +@MMYOLO_TASK.register_module(Task.OBJECT_DETECTION.value) +class YOLOObjectDetection(ObjectDetection): + """YOLO Object Detection task.""" + + def get_visualizer(self, name: str, save_dir: str): + """Get visualizer. + + Args: + name (str): Name of visualizer. + save_dir (str): Directory to save visualization results. + + Returns: + Visualizer: A visualizer instance. + """ + from mmdet.visualization import DetLocalVisualizer # noqa: F401,F403 + metainfo = _get_dataset_metainfo(self.model_cfg) + visualizer = super().get_visualizer(name, save_dir) + if metainfo is not None: + visualizer.dataset_meta = metainfo + return visualizer + + def build_pytorch_model(self, + model_checkpoint: Optional[str] = None, + cfg_options: Optional[Dict] = None, + **kwargs) -> torch.nn.Module: + """Initialize torch model. + + Args: + model_checkpoint (str): The checkpoint file of torch model, + defaults to `None`. 
+ cfg_options (dict): Optional config key-pair parameters. + Returns: + nn.Module: An initialized torch model generated by other OpenMMLab + codebases. + """ + from copy import deepcopy + + from mmengine.model import revert_sync_batchnorm + from mmengine.registry import MODELS + + from mmyolo.utils import switch_to_deploy + + model = deepcopy(self.model_cfg.model) + preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {})) + preprocess_cfg.update( + deepcopy(self.model_cfg.get('data_preprocessor', {}))) + model.setdefault('data_preprocessor', preprocess_cfg) + model = MODELS.build(model) + if model_checkpoint is not None: + from mmengine.runner.checkpoint import load_checkpoint + load_checkpoint(model, model_checkpoint, map_location=self.device) + + model = revert_sync_batchnorm(model) + switch_to_deploy(model) + model = model.to(self.device) + model.eval() + return model diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e0a126c09797b327f7309d6e980245b7e44773 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401,F403 +from .optimizers import * # noqa: F401,F403 diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b8deebc8827da5b9a3f8c92a2fffe70e42d0bfa --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook +from .switch_to_deploy_hook import SwitchToDeployHook +from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook', + 'PPYOLOEParamSchedulerHook' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..26dfe6ef2d5cf590ea381efb3e42cdc1c5492361 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class PPYOLOEParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of PPYOLOE. We + use this hook to implement adaptive computation for `warmup_total_iters`, + which is not possible with the built-in ParamScheduler in mmyolo. + + Args: + warmup_min_iter (int): Minimum warmup iters. Defaults to 1000. + start_factor (float): The number we multiply learning rate in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0. + warmup_epochs (int): Epochs for warmup. Defaults to 5. + min_lr_ratio (float): Minimum learning rate ratio. + total_epochs (int): In PPYOLOE, `total_epochs` is set to + training_epochs x 1.2. Defaults to 360. 
+ """ + priority = 9 + + def __init__(self, + warmup_min_iter: int = 1000, + start_factor: float = 0., + warmup_epochs: int = 5, + min_lr_ratio: float = 0.0, + total_epochs: int = 360): + + self.warmup_min_iter = warmup_min_iter + self.start_factor = start_factor + self.warmup_epochs = warmup_epochs + self.min_lr_ratio = min_lr_ratio + self.total_epochs = total_epochs + + self._warmup_end = False + self._base_lr = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._min_lr = [i * self.min_lr_ratio for i in self._base_lr] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. 
+ """ + cur_iters = runner.iter + optimizer = runner.optim_wrapper.optimizer + dataloader_len = len(runner.train_dataloader) + + # The minimum warmup is self.warmup_min_iter + warmup_total_iters = max( + round(self.warmup_epochs * dataloader_len), self.warmup_min_iter) + + if cur_iters <= warmup_total_iters: + # warm up + alpha = cur_iters / warmup_total_iters + factor = self.start_factor * (1 - alpha) + alpha + + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * factor + else: + for group_idx, param in enumerate(optimizer.param_groups): + total_iters = self.total_epochs * dataloader_len + lr = self._min_lr[group_idx] + ( + self._base_lr[group_idx] - + self._min_lr[group_idx]) * 0.5 * ( + math.cos((cur_iters - warmup_total_iters) * math.pi / + (total_iters - warmup_total_iters)) + 1.0) + param['lr'] = lr diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..28ac345f40c44c974fb33b7bf9756a61fcabf820 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS +from mmyolo.utils import switch_to_deploy + + +@HOOKS.register_module() +class SwitchToDeployHook(Hook): + """Switch to deploy mode before testing. + + This hook converts the multi-channel structure of the training network + (high performance) to the one-way structure of the testing network (fast + speed and memory saving). 
+ """ + + def before_test_epoch(self, runner: Runner): + """Switch to deploy mode before testing.""" + switch_to_deploy(runner.model) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..777bb49d7abd7fc37385370546d05e70c274b3b7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import numpy as np +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +def linear_fn(lr_factor: float, max_epochs: int): + """Generate linear function.""" + return lambda x: (1 - x / max_epochs) * (1.0 - lr_factor) + lr_factor + + +def cosine_fn(lr_factor: float, max_epochs: int): + """Generate cosine function.""" + return lambda x: ( + (1 - math.cos(x * math.pi / max_epochs)) / 2) * (lr_factor - 1) + 1 + + +@HOOKS.register_module() +class YOLOv5ParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of YOLOv5.""" + priority = 9 + + scheduler_maps = {'linear': linear_fn, 'cosine': cosine_fn} + + def __init__(self, + scheduler_type: str = 'linear', + lr_factor: float = 0.01, + max_epochs: int = 300, + warmup_epochs: int = 3, + warmup_bias_lr: float = 0.1, + warmup_momentum: float = 0.8, + warmup_mim_iter: int = 1000, + **kwargs): + + assert scheduler_type in self.scheduler_maps + + self.warmup_epochs = warmup_epochs + self.warmup_bias_lr = warmup_bias_lr + self.warmup_momentum = warmup_momentum + self.warmup_mim_iter = warmup_mim_iter + + kwargs.update({'lr_factor': lr_factor, 'max_epochs': max_epochs}) + self.scheduler_fn = self.scheduler_maps[scheduler_type](**kwargs) + + self._warmup_end = False + 
self._base_lr = None + self._base_momentum = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + group.setdefault('initial_momentum', group.get('momentum', -1)) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._base_momentum = [ + group['initial_momentum'] for group in optimizer.param_groups + ] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + + # The minimum warmup is self.warmup_mim_iter + warmup_total_iters = max( + round(self.warmup_epochs * len(runner.train_dataloader)), + self.warmup_mim_iter) + + if cur_iters <= warmup_total_iters: + xp = [0, warmup_total_iters] + for group_idx, param in enumerate(optimizer.param_groups): + if group_idx == 2: + # bias learning rate will be handled specially + yp = [ + self.warmup_bias_lr, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + else: + yp = [ + 0.0, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + param['lr'] = np.interp(cur_iters, xp, yp) + + if 'momentum' in param: + param['momentum'] = np.interp( + cur_iters, xp, + [self.warmup_momentum, self._base_momentum[group_idx]]) + else: + self._warmup_end = True + + def after_train_epoch(self, runner: Runner): + """Operations after each training epoch. 
+ + Args: + runner (Runner): The runner of the training process. + """ + if not self._warmup_end: + return + + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * self.scheduler_fn( + cur_epoch) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..27711768c3f89b26410ae1373bc920d0bfded603 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of latter epochs in the end of the + training to close the data augmentation and switch to L1 loss. + Defaults to 15. 
+ """ + + def __init__(self, + num_last_epochs: int = 15, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_pipeline_cfg = new_train_pipeline + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation and switches to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if (epoch + 1) == runner.max_epochs - self.num_last_epochs: + runner.logger.info(f'New Pipeline: {self.new_train_pipeline_cfg}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline_cfg + # Note: Why rebuild the dataset? + # When build_dataloader will make a deep copy of the dataset, + # it will lead to potential risks, such as the global instance + # object FileClient data is disordered. + # This problem needs to be solved in the future. + new_train_dataloader = Runner.build_dataloader( + train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + + runner.logger.info('recreate the dataloader!') + runner.logger.info('Add additional bbox reg loss now!') + model.bbox_head.use_bbox_aux = True diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b598020d05db54cdc1d803d39ebd2c91026a6112 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class YOLOv5OptimizerConstructor:
    """YOLOv5 constructor for optimizers.

    It has the following functions:

        - divides the optimizer parameters into 3 groups:
          Conv, Bias and BN

        - support `weight_decay` parameter adaption based on
          `batch_size_per_gpu`

    Args:
        optim_wrapper_cfg (dict): The config dict of the optimizer wrapper.
            Positional fields are

                - ``type``: class name of the OptimizerWrapper
                - ``optimizer``: The configuration of optimizer.

            Optional fields are

                - any arguments of the corresponding optimizer wrapper type,
                  e.g., accumulative_counts, clip_grad, etc.

            The positional fields of ``optimizer`` are

                - `type`: class name of the optimizer.

            Optional fields are

                - any arguments of the corresponding optimizer type, e.g.,
                  lr, weight_decay, momentum, etc.

        paramwise_cfg (dict, optional): Parameter-wise options. Must include
            `base_total_batch_size` if not None. If the total input batch
            is smaller than `base_total_batch_size`, the `weight_decay`
            parameter will be kept unchanged, otherwise linear scaling.

    Example:
        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
        >>> optim_wrapper_cfg = dict(
        >>>     type='OptimWrapper',
        >>>     optimizer=dict(type='SGD', lr=0.01, momentum=0.9,
        >>>                    weight_decay=0.0001, batch_size_per_gpu=16))
        >>> paramwise_cfg = dict(base_total_batch_size=64)
        >>> optim_wrapper_builder = YOLOv5OptimizerConstructor(
        >>>     optim_wrapper_cfg, paramwise_cfg)
        >>> optim_wrapper = optim_wrapper_builder(model)
    """

    def __init__(self,
                 optim_wrapper_cfg: dict,
                 paramwise_cfg: Optional[dict] = None):
        if paramwise_cfg is None:
            paramwise_cfg = {'base_total_batch_size': 64}
        assert 'base_total_batch_size' in paramwise_cfg

        if not isinstance(optim_wrapper_cfg, dict):
            # A single message string; the original passed two positional
            # arguments, which rendered the error as a tuple.
            raise TypeError('optimizer_cfg should be a dict, '
                            f'but got {type(optim_wrapper_cfg)}')
        assert 'optimizer' in optim_wrapper_cfg, (
            '`optim_wrapper_cfg` must contain "optimizer" config')

        # Shallow-copy before popping so the caller's config is not mutated
        # and can be reused to build another constructor.
        self.optim_wrapper_cfg = optim_wrapper_cfg.copy()
        self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer')
        self.base_total_batch_size = paramwise_cfg['base_total_batch_size']

    def __call__(self, model: nn.Module) -> OptimWrapper:
        """Build the optimizer wrapper for ``model``.

        Args:
            model (nn.Module): The model (optionally wrapped) whose
                parameters are split into weight / norm / bias groups.

        Returns:
            OptimWrapper: The wrapper built from ``optim_wrapper_cfg`` around
            the grouped optimizer.
        """
        if is_model_wrapper(model):
            model = model.module
        optimizer_cfg = self.optimizer_cfg.copy()
        weight_decay = optimizer_cfg.pop('weight_decay', 0)

        if 'batch_size_per_gpu' in optimizer_cfg:
            batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
            # ``accumulate`` is the number of accumulation steps needed to
            # reach ``base_total_batch_size``; weight decay is scaled by the
            # ratio of the resulting effective batch size to the base one.
            total_batch_size = get_world_size() * batch_size_per_gpu
            accumulate = max(
                round(self.base_total_batch_size / total_batch_size), 1)
            scale_factor = total_batch_size * \
                accumulate / self.base_total_batch_size

            if scale_factor != 1:
                weight_decay *= scale_factor
                print_log(f'Scaled weight_decay to {weight_decay}', 'current')

        decay_weights, norm_weights, biases = [], [], []

        for v in model.modules():
            if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
                biases.append(v.bias)
            # Includes SyncBatchNorm
            if isinstance(v, nn.modules.batchnorm._NormBase):
                # Norm layers built with ``affine=False`` have no weight;
                # skip them instead of handing ``None`` to the optimizer.
                if isinstance(v.weight, nn.Parameter):
                    norm_weights.append(v.weight)
            elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
                decay_weights.append(v.weight)

        # Note: Make sure bias is in the last parameter group
        optimizer_cfg['params'] = [
            # conv
            {
                'params': decay_weights,
                'weight_decay': weight_decay
            },
            # bn
            {
                'params': norm_weights
            },
            # bias
            {
                'params': biases
            },
        ]

        print_log(
            'Optimizer groups: %g .bias, %g conv.weight, %g other' %
            (len(biases), len(decay_weights), len(norm_weights)), 'current')

        optimizer = OPTIMIZERS.build(optimizer_cfg)
        optim_wrapper = OPTIM_WRAPPERS.build(
            self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
        return optim_wrapper
# TODO: Consider merging into YOLOv5OptimizerConstructor
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class YOLOv7OptimWrapperConstructor:
    """YOLOv7 constructor for optimizer wrappers.

    It has the following functions:

        - divides the optimizer parameters into 3 groups:
          Conv, Bias and BN/ImplicitA/ImplicitM

        - support `weight_decay` parameter adaption based on
          `batch_size_per_gpu`

    Args:
        optim_wrapper_cfg (dict): The config dict of the optimizer wrapper.
            Positional fields are

                - ``type``: class name of the OptimizerWrapper
                - ``optimizer``: The configuration of optimizer.

            Optional fields are

                - any arguments of the corresponding optimizer wrapper type,
                  e.g., accumulative_counts, clip_grad, etc.

            The positional fields of ``optimizer`` are

                - `type`: class name of the optimizer.

            Optional fields are

                - any arguments of the corresponding optimizer type, e.g.,
                  lr, weight_decay, momentum, etc.

        paramwise_cfg (dict, optional): Parameter-wise options. Must include
            `base_total_batch_size` if not None. If the total input batch
            is smaller than `base_total_batch_size`, the `weight_decay`
            parameter will be kept unchanged, otherwise linear scaling.

    Example:
        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
        >>> optim_wrapper_cfg = dict(
        >>>     type='OptimWrapper',
        >>>     optimizer=dict(type='SGD', lr=0.01, momentum=0.9,
        >>>                    weight_decay=0.0001, batch_size_per_gpu=16))
        >>> paramwise_cfg = dict(base_total_batch_size=64)
        >>> optim_wrapper_builder = YOLOv7OptimWrapperConstructor(
        >>>     optim_wrapper_cfg, paramwise_cfg)
        >>> optim_wrapper = optim_wrapper_builder(model)
    """

    def __init__(self,
                 optim_wrapper_cfg: dict,
                 paramwise_cfg: Optional[dict] = None):
        if paramwise_cfg is None:
            paramwise_cfg = {'base_total_batch_size': 64}
        assert 'base_total_batch_size' in paramwise_cfg

        if not isinstance(optim_wrapper_cfg, dict):
            # A single message string; the original passed two positional
            # arguments, which rendered the error as a tuple.
            raise TypeError('optimizer_cfg should be a dict, '
                            f'but got {type(optim_wrapper_cfg)}')
        assert 'optimizer' in optim_wrapper_cfg, (
            '`optim_wrapper_cfg` must contain "optimizer" config')

        # Shallow-copy before popping so the caller's config is not mutated
        # and can be reused to build another constructor.
        self.optim_wrapper_cfg = optim_wrapper_cfg.copy()
        self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer')
        self.base_total_batch_size = paramwise_cfg['base_total_batch_size']

    def __call__(self, model: nn.Module) -> OptimWrapper:
        """Build the optimizer wrapper for ``model``.

        Args:
            model (nn.Module): The model (optionally wrapped) whose
                parameters are split into decay / no-decay / bias groups.

        Returns:
            OptimWrapper: The wrapper built from ``optim_wrapper_cfg`` around
            the grouped optimizer.
        """
        if is_model_wrapper(model):
            model = model.module
        optimizer_cfg = self.optimizer_cfg.copy()
        weight_decay = optimizer_cfg.pop('weight_decay', 0)

        if 'batch_size_per_gpu' in optimizer_cfg:
            batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
            # ``accumulate`` is the number of accumulation steps needed to
            # reach ``base_total_batch_size``; weight decay is scaled by the
            # ratio of the resulting effective batch size to the base one.
            total_batch_size = get_world_size() * batch_size_per_gpu
            accumulate = max(
                round(self.base_total_batch_size / total_batch_size), 1)
            scale_factor = total_batch_size * \
                accumulate / self.base_total_batch_size

            if scale_factor != 1:
                weight_decay *= scale_factor
                print_log(f'Scaled weight_decay to {weight_decay}', 'current')

        no_decay, decay_weights, biases = [], [], []
        for v in model.modules():
            # no decay
            # Caution: Coupling with model
            if isinstance(v, (ImplicitA, ImplicitM)):
                no_decay.append(v.implicit)
            elif isinstance(v, nn.modules.batchnorm._NormBase):
                # Norm layers built with ``affine=False`` have no weight;
                # skip them instead of handing ``None`` to the optimizer.
                if isinstance(v.weight, nn.Parameter):
                    no_decay.append(v.weight)
            # apply decay
            elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
                decay_weights.append(v.weight)

            # biases, no decay
            if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
                biases.append(v.bias)

        # Note: Make sure bias is in the last parameter group
        optimizer_cfg['params'] = [
            # conv
            {
                'params': decay_weights,
                'weight_decay': weight_decay
            },
            # bn ...
            {
                'params': no_decay
            },
            # bias
            {
                'params': biases
            },
        ]

        print_log(
            'Optimizer groups: %g .bias, %g conv.weight, %g other' %
            (len(biases), len(decay_weights), len(no_decay)), 'current')

        optimizer = OPTIMIZERS.build(optimizer_cfg)
        optim_wrapper = OPTIM_WRAPPERS.build(
            self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
        return optim_wrapper
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .plugins import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48c8e28b1e7eb97e3f7cb064c75af0dc79b4cc8d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_backbone import BaseBackbone +from .csp_darknet import YOLOv5CSPDarknet, YOLOv8CSPDarknet, YOLOXCSPDarknet +from .csp_resnet import PPYOLOECSPResNet +from .cspnext import CSPNeXt +from .efficient_rep import YOLOv6CSPBep, YOLOv6EfficientRep +from .yolov7_backbone import YOLOv7Backbone + +__all__ = [ + 'YOLOv5CSPDarknet', 'BaseBackbone', 'YOLOv6EfficientRep', 'YOLOv6CSPBep', + 'YOLOXCSPDarknet', 'CSPNeXt', 'YOLOv7Backbone', 'PPYOLOECSPResNet', + 'YOLOv8CSPDarknet' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..730c7095eccf66b0d563fad96122454c98dff0ac --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@MODELS.register_module()
class BaseBackbone(BaseModule, metaclass=ABCMeta):
    """BaseBackbone backbone used in YOLO series.

    .. code:: text

     Backbone model structure diagram
     +-----------+
     |   input   |
     +-----------+
           v
     +-----------+
     |   stem    |
     |   layer   |
     +-----------+
           v
     +-----------+
     |   stage   |
     |  layer 1  |
     +-----------+
           v
     +-----------+
     |   stage   |
     |  layer 2  |
     +-----------+
           v
         ......
           v
     +-----------+
     |   stage   |
     |  layer n  |
     +-----------+
     In P5 model, n=4
     In P6 model, n=5

    Args:
        arch_setting (list): Architecture of BaseBackbone.
        plugins (dict or list[dict]): Plugin(s) for stages, each dict
            contains:

            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels: Number of input image channels. Defaults to 3.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Defaults to None.
        act_cfg (dict): Config dict for activation layer.
            Defaults to None.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 arch_setting: list,
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Sequence[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 plugins: Union[dict, List[dict]] = None,
                 norm_cfg: ConfigType = None,
                 act_cfg: ConfigType = None,
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg)
        self.num_stages = len(arch_setting)
        self.arch_setting = arch_setting

        # Index 0 is the stem, indices 1..n are the stages.
        assert set(out_indices).issubset(range(len(arch_setting) + 1))

        if frozen_stages not in range(-1, len(arch_setting) + 1):
            raise ValueError('"frozen_stages" must be in range(-1, '
                             'len(arch_setting) + 1). But received '
                             f'{frozen_stages}')

        self.input_channels = input_channels
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.widen_factor = widen_factor
        self.deepen_factor = deepen_factor
        self.norm_eval = norm_eval
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.plugins = plugins

        # All attributes read by the build_* hooks must be set before this
        # point, because subclasses' builders are invoked from here on.
        self.stem = self.build_stem_layer()
        self.layers = ['stem']

        for idx, setting in enumerate(arch_setting):
            stage = []
            stage += self.build_stage_layer(idx, setting)
            if plugins is not None:
                stage += self.make_stage_plugins(plugins, idx, setting)
            self.add_module(f'stage{idx + 1}', nn.Sequential(*stage))
            self.layers.append(f'stage{idx + 1}')

    @abstractmethod
    def build_stem_layer(self):
        """Build a stem layer."""
        pass

    @abstractmethod
    def build_stage_layer(self, stage_idx: int, setting: list):
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer.
        """
        pass

    def make_stage_plugins(self, plugins, stage_idx, setting):
        """Make plugins for backbone ``stage_idx`` th stage.

        Currently we support to insert ``context_block``,
        ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block``
        into the backbone.


        An example of plugins format could be:

        Examples:
            >>> plugins=[
            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
            ...          stages=(False, True, True, True)),
            ...     dict(cfg=dict(type='yyy'),
            ...          stages=(True, True, True, True)),
            ... ]
            >>> model = YOLOv5CSPDarknet()
            >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting)
            >>> assert len(stage_plugins) == 1

        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:

        .. code-block:: none

            conv1 -> conv2 -> conv3 -> yyy

        Suppose ``stage_idx=1``, the structure of blocks in the stage would be:

        .. code-block:: none

            conv1 -> conv2 -> conv3 -> xxx -> yyy


        Args:
            plugins (dict or list[dict]): Plugin cfg(s) to build. A single
                dict is treated as a one-element list. The postfix is
                required if multiple same type plugins are inserted.
            stage_idx (int): Index of stage to build
                If stages is missing, the plugin would be applied to all
                stages.
            setting (list): The architecture setting of a stage layer.

        Returns:
            list[nn.Module]: Plugins for current stage
        """
        # The constructor advertises ``Union[dict, List[dict]]``; iterating a
        # bare dict would walk its keys, so wrap it into a list first.
        if isinstance(plugins, dict):
            plugins = [plugins]

        # TODO: It is not general enough to support any channel and needs
        # to be refactored
        in_channels = int(setting[1] * self.widen_factor)
        plugin_layers = []
        for plugin in plugins:
            plugin = plugin.copy()
            stages = plugin.pop('stages', None)
            assert stages is None or len(stages) == self.num_stages
            if stages is None or stages[stage_idx]:
                # build_plugin_layer returns (name, layer); only the layer
                # is needed here.
                _, layer = build_plugin_layer(
                    plugin['cfg'], in_channels=in_channels)
                plugin_layers.append(layer)
        return plugin_layers

    def _freeze_stages(self):
        """Freeze the parameters of the specified stage so that they are no
        longer updated."""
        if self.frozen_stages >= 0:
            # ``self.layers[0]`` is the stem, hence ``frozen_stages + 1``.
            for i in range(self.frozen_stages + 1):
                m = getattr(self, self.layers[i])
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode: bool = True):
        """Convert the model into training mode while keep normalization layer
        frozen."""
        super().train(mode)
        # Re-apply freezing because ``super().train`` resets every submodule
        # to the requested mode.
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()

    def forward(self, x: torch.Tensor) -> tuple:
        """Forward batch_inputs from the data_preprocessor."""
        outs = []
        for i, layer_name in enumerate(self.layers):
            layer = getattr(self, layer_name)
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)

        return tuple(outs)
@MODELS.register_module()
class YOLOv5CSPDarknet(BaseBackbone):
    """CSP-Darknet backbone used in YOLOv5.

    Args:
        arch (str): Architecture of CSP-Darknet, from {P5, P6}.
            Defaults to P5.
        plugins (list[dict]): List of plugins for stages, each dict contains:

            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels (int): Number of input image channels. Defaults to: 3.
        out_indices (Tuple[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='SiLU', inplace=True).
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        init_cfg (Union[dict,list[dict]], optional): Initialization config
            dict. Defaults to None.

    Example:
        >>> from mmyolo.models import YOLOv5CSPDarknet
        >>> import torch
        >>> model = YOLOv5CSPDarknet()
        >>> model.eval()
        >>> inputs = torch.rand(1, 3, 416, 416)
        >>> level_outputs = model(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        ...
        (1, 256, 52, 52)
        (1, 512, 26, 26)
        (1, 1024, 13, 13)
    """
    # From left to right:
    # in_channels, out_channels, num_blocks, add_identity, use_spp
    arch_settings = {
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, True, True]]
    }

    def __init__(self,
                 arch: str = 'P5',
                 plugins: Union[dict, List[dict]] = None,
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Tuple[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None):
        # Everything is forwarded to the base class, which drives the
        # build_* hooks defined below.
        super().__init__(
            arch_setting=self.arch_settings[arch],
            deepen_factor=deepen_factor,
            widen_factor=widen_factor,
            input_channels=input_channels,
            out_indices=out_indices,
            frozen_stages=frozen_stages,
            plugins=plugins,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            norm_eval=norm_eval,
            init_cfg=init_cfg)

    def build_stem_layer(self) -> nn.Module:
        """Build a stem layer."""
        stem_channels = make_divisible(self.arch_setting[0][0],
                                       self.widen_factor)
        return ConvModule(
            self.input_channels,
            stem_channels,
            kernel_size=6,
            stride=2,
            padding=2,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer.

        Returns:
            list: The stage's modules in order: a stride-2 conv, a CSP layer
            and, when the setting requests it, an SPPF bottleneck.
        """
        raw_in, raw_out, raw_blocks, add_identity, use_spp = setting

        # Scale the nominal widths/depth by the model-size multipliers.
        in_ch = make_divisible(raw_in, self.widen_factor)
        out_ch = make_divisible(raw_out, self.widen_factor)
        depth = make_round(raw_blocks, self.deepen_factor)

        shared_cfg = dict(norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)
        modules = [
            ConvModule(
                in_ch,
                out_ch,
                kernel_size=3,
                stride=2,
                padding=1,
                **shared_cfg),
            CSPLayer(
                out_ch,
                out_ch,
                num_blocks=depth,
                add_identity=add_identity,
                **shared_cfg),
        ]
        if use_spp:
            modules.append(
                SPPFBottleneck(
                    out_ch, out_ch, kernel_sizes=5, **shared_cfg))
        return modules

    def init_weights(self):
        """Initialize the parameters."""
        if self.init_cfg is not None:
            super().init_weights()
            return

        # In order to be consistent with the source code,
        # reset the Conv2d initialization parameters
        for module in self.modules():
            if isinstance(module, torch.nn.Conv2d):
                module.reset_parameters()
@MODELS.register_module()
class YOLOv8CSPDarknet(BaseBackbone):
    """CSP-Darknet backbone used in YOLOv8.

    Args:
        arch (str): Architecture of CSP-Darknet, from {P5}.
            Defaults to P5.
        last_stage_out_channels (int): Final layer output channel.
            Defaults to 1024.
        plugins (list[dict]): List of plugins for stages, each dict contains:

            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels (int): Number of input image channels. Defaults to: 3.
        out_indices (Tuple[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='SiLU', inplace=True).
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        init_cfg (Union[dict,list[dict]], optional): Initialization config
            dict. Defaults to None.

    Example:
        >>> from mmyolo.models import YOLOv8CSPDarknet
        >>> import torch
        >>> model = YOLOv8CSPDarknet()
        >>> model.eval()
        >>> inputs = torch.rand(1, 3, 416, 416)
        >>> level_outputs = model(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        ...
        (1, 256, 52, 52)
        (1, 512, 26, 26)
        (1, 1024, 13, 13)
    """
    # From left to right:
    # in_channels, out_channels, num_blocks, add_identity, use_spp
    # the final out_channels will be set according to the param.
    arch_settings = {
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, None, 3, True, True]],
    }

    def __init__(self,
                 arch: str = 'P5',
                 last_stage_out_channels: int = 1024,
                 plugins: Union[dict, List[dict]] = None,
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Tuple[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None):
        # Fill in the last stage's out_channels on a per-instance copy.
        # Writing into the class-level ``arch_settings`` would leak this
        # instance's value into every other instance (and into the
        # ``arch_setting`` list that earlier instances still reference).
        arch_setting = [list(stage) for stage in self.arch_settings[arch]]
        arch_setting[-1][1] = last_stage_out_channels
        super().__init__(
            arch_setting,
            deepen_factor,
            widen_factor,
            input_channels=input_channels,
            out_indices=out_indices,
            plugins=plugins,
            frozen_stages=frozen_stages,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            norm_eval=norm_eval,
            init_cfg=init_cfg)

    def build_stem_layer(self) -> nn.Module:
        """Build a stem layer."""
        return ConvModule(
            self.input_channels,
            make_divisible(self.arch_setting[0][0], self.widen_factor),
            kernel_size=3,
            stride=2,
            padding=1,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer.

        Returns:
            list: The stage's modules in order: a stride-2 conv, a
            two-conv CSP layer and, when the setting requests it, an SPPF
            bottleneck.
        """
        in_channels, out_channels, num_blocks, add_identity, use_spp = setting

        # Scale the nominal widths/depth by the model-size multipliers.
        in_channels = make_divisible(in_channels, self.widen_factor)
        out_channels = make_divisible(out_channels, self.widen_factor)
        num_blocks = make_round(num_blocks, self.deepen_factor)
        stage = []
        conv_layer = ConvModule(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        stage.append(conv_layer)
        csp_layer = CSPLayerWithTwoConv(
            out_channels,
            out_channels,
            num_blocks=num_blocks,
            add_identity=add_identity,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
        stage.append(csp_layer)
        if use_spp:
            spp = SPPFBottleneck(
                out_channels,
                out_channels,
                kernel_sizes=5,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
            stage.append(spp)
        return stage

    def init_weights(self):
        """Initialize the parameters."""
        if self.init_cfg is None:
            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d):
                    # In order to be consistent with the source code,
                    # reset the Conv2d initialization parameters
                    m.reset_parameters()
        else:
            super().init_weights()
+ frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOXCSPDarknet + >>> import torch + >>> model = YOLOXCSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + spp_kernal_sizes: Tuple[int] = (5, 9, 13), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.use_depthwise = use_depthwise + self.spp_kernal_sizes = spp_kernal_sizes + super().__init__(self.arch_settings[arch], deepen_factor, widen_factor, + input_channels, out_indices, frozen_stages, plugins, + norm_cfg, act_cfg, norm_eval, init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return Focus( + 3, + make_divisible(64, self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
@MODELS.register_module()
class PPYOLOECSPResNet(BaseBackbone):
    """CSP-ResNet backbone used in PPYOLOE.

    Args:
        arch (str): Key into ``arch_settings``. Only 'P5' is defined.
            Defaults to P5.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in each stage by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels (int): Number of input image channels.
            Defaults to 3.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        plugins (list[dict]): List of plugins for stages, each dict contains:
            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        arch_ovewrite (list): Overwrite default arch settings.
            Defaults to None.
        block_cfg (dict): Config dict for block. Defaults to
            dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True)
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
            config norm layer. Defaults to dict(type='BN', momentum=0.1,
            eps=1e-5).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to dict(type='SiLU', inplace=True).
        attention_cfg (dict): Config dict for `EffectiveSELayer`.
            Defaults to dict(type='EffectiveSELayer',
            act_cfg=dict(type='HSigmoid')).
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only.
        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
            list[:obj:`ConfigDict`]): Initialization config dict.
        use_large_stem (bool): Whether to use large stem layer.
            Defaults to False.
    """
    # Per stage, from left to right:
    # in_channels, out_channels, num_blocks
    arch_settings = {
        'P5': [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]]
    }

    def __init__(self,
                 arch: str = 'P5',
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Tuple[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 plugins: Union[dict, List[dict]] = None,
                 arch_ovewrite: dict = None,
                 block_cfg: ConfigType = dict(
                     type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True),
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.1, eps=1e-5),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 attention_cfg: ConfigType = dict(
                     type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')),
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None,
                 use_large_stem: bool = False):
        # These attributes are read by the build_* hooks below.
        self.attention_cfg = attention_cfg
        self.use_large_stem = use_large_stem
        self.block_cfg = block_cfg

        # The named preset is looked up first; a truthy ``arch_ovewrite``
        # then replaces it.
        stage_specs = self.arch_settings[arch]
        if arch_ovewrite:
            stage_specs = arch_ovewrite

        # Widths and depths are scaled here, so the stage settings handed
        # to the base class already hold final channel and block counts.
        scaled_specs = []
        for c_in, c_out, depth in stage_specs:
            scaled_specs.append([
                int(c_in * widen_factor),
                int(c_out * widen_factor),
                round(depth * deepen_factor)
            ])

        super().__init__(
            scaled_specs,
            deepen_factor,
            widen_factor,
            input_channels=input_channels,
            out_indices=out_indices,
            plugins=plugins,
            frozen_stages=frozen_stages,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            norm_eval=norm_eval,
            init_cfg=init_cfg)

    def build_stem_layer(self) -> nn.Module:
        """Build a stem layer.

        The stem is a stack of 3x3 ``ConvModule`` layers whose first layer
        has stride 2. The large stem inserts one extra stride-1 layer at
        half width.
        """
        full_width = self.arch_setting[0][0]
        half_width = full_width // 2

        # (in_channels, out_channels, stride) of each stem convolution.
        if self.use_large_stem:
            plan = [(self.input_channels, half_width, 2),
                    (half_width, half_width, 1),
                    (half_width, full_width, 1)]
        else:
            plan = [(self.input_channels, half_width, 2),
                    (half_width, full_width, 1)]

        layers = [
            ConvModule(
                c_in,
                c_out,
                3,
                stride=stride,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg) for c_in, c_out, stride in plan
        ]
        return nn.Sequential(*layers)

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer,
                ``[in_channels, out_channels, num_blocks]``.

        Returns:
            list: A single-element list holding the stage's ``CSPResLayer``.
        """
        c_in, c_out, depth = setting
        return [
            CSPResLayer(
                in_channels=c_in,
                out_channels=c_out,
                num_block=depth,
                block_cfg=self.block_cfg,
                stride=2,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg,
                attention_cfg=self.attention_cfg,
                use_spp=False)
        ]
+ deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN'). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
def build_stem_layer(self) -> nn.Module:
    """Build a stem layer.

    The stem is three 3x3 ``ConvModule`` layers: a stride-2 convolution to
    half the stem width, a stride-1 convolution at half width, and a
    stride-1 convolution to the full stem width. The full width is the
    first stage's input channels scaled by ``self.widen_factor``.

    Returns:
        nn.Module: The stem as an ``nn.Sequential``.
    """
    # The channel arithmetic is hoisted; the expressions are unchanged.
    stem_channels = int(self.arch_setting[0][0] * self.widen_factor)
    half_channels = int(self.arch_setting[0][0] * self.widen_factor // 2)
    shared = dict(padding=1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)

    return nn.Sequential(
        # Honour the ``input_channels`` constructor argument instead of a
        # hard-coded 3; the default of 3 builds the same first layer.
        ConvModule(
            self.input_channels, half_channels, 3, stride=2, **shared),
        ConvModule(half_channels, half_channels, 3, stride=1, **shared),
        ConvModule(half_channels, stem_channels, 3, stride=1, **shared))
@MODELS.register_module()
class YOLOv6EfficientRep(BaseBackbone):
    """EfficientRep backbone used in YOLOv6.

    Args:
        arch (str): Key into ``arch_settings``. Only 'P5' is defined.
            Defaults to P5.
        plugins (list[dict]): List of plugins for stages, each dict contains:
            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in each stage by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels (int): Number of input image channels. Defaults to 3.
        out_indices (Tuple[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        use_cspsppf (bool): Whether the SPP module of a stage flagged
            ``use_spp`` is a ``CSPSPPFBottleneck`` instead of a
            ``SPPFBottleneck``. Defaults to False.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='ReLU', inplace=True).
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        block_cfg (dict): Config dict for the block used to build each
            layer. Defaults to dict(type='RepVGGBlock').
        init_cfg (Union[dict, list[dict]], optional): Initialization config
            dict. Defaults to None.

    Example:
        >>> from mmyolo.models import YOLOv6EfficientRep
        >>> import torch
        >>> model = YOLOv6EfficientRep()
        >>> model.eval()
        >>> inputs = torch.rand(1, 3, 416, 416)
        >>> level_outputs = model(inputs)
        >>> for level_out in level_outputs:
        ...     print(tuple(level_out.shape))
        ...
        (1, 256, 52, 52)
        (1, 512, 26, 26)
        (1, 1024, 13, 13)
    """
    # From left to right:
    # in_channels, out_channels, num_blocks, use_spp
    arch_settings = {
        'P5': [[64, 128, 6, False], [128, 256, 12, False],
               [256, 512, 18, False], [512, 1024, 6, True]]
    }

    def __init__(self,
                 arch: str = 'P5',
                 plugins: Union[dict, List[dict]] = None,
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Tuple[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 use_cspsppf: bool = False,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='ReLU', inplace=True),
                 norm_eval: bool = False,
                 block_cfg: ConfigType = dict(type='RepVGGBlock'),
                 init_cfg: OptMultiConfig = None):
        # Set before calling the base constructor because the build_*
        # hooks below read them.
        self.block_cfg = block_cfg
        self.use_cspsppf = use_cspsppf
        super().__init__(
            self.arch_settings[arch],
            deepen_factor,
            widen_factor,
            input_channels=input_channels,
            out_indices=out_indices,
            plugins=plugins,
            frozen_stages=frozen_stages,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            norm_eval=norm_eval,
            init_cfg=init_cfg)

    def build_stem_layer(self) -> nn.Module:
        """Build a stem layer.

        The stem is one block built from ``self.block_cfg`` with a 3x3
        kernel and stride 2.
        """
        # Work on a copy so the shared ``self.block_cfg`` is not mutated.
        block_cfg = self.block_cfg.copy()
        block_cfg.update(
            dict(
                in_channels=self.input_channels,
                out_channels=int(self.arch_setting[0][0] * self.widen_factor),
                kernel_size=3,
                stride=2,
            ))
        return MODELS.build(block_cfg)

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer,
                ``[in_channels, out_channels, num_blocks, use_spp]``.

        Returns:
            list: The stage modules: a stride-2 block followed by a
            ``RepStageBlock``, plus an SPP module when ``use_spp`` is set.
        """
        in_channels, out_channels, num_blocks, use_spp = setting

        in_channels = int(in_channels * self.widen_factor)
        out_channels = int(out_channels * self.widen_factor)
        num_blocks = make_round(num_blocks, self.deepen_factor)

        rep_stage_block = RepStageBlock(
            in_channels=out_channels,
            out_channels=out_channels,
            num_blocks=num_blocks,
            block_cfg=self.block_cfg,
        )

        # Stride-2 block that changes the channel count; built from a copy
        # of the shared block config.
        block_cfg = self.block_cfg.copy()
        block_cfg.update(
            dict(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=2))

        stage = [nn.Sequential(MODELS.build(block_cfg), rep_stage_block)]

        if use_spp:
            # Pick the SPP variant first so only the module that is kept
            # gets built, rather than building a plain SPPF and then
            # replacing it.
            spp_cls = CSPSPPFBottleneck if self.use_cspsppf \
                else SPPFBottleneck
            stage.append(
                spp_cls(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    kernel_sizes=5,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg))
        return stage

    def init_weights(self):
        """Initialize the parameters.

        Without an ``init_cfg`` every ``Conv2d`` is reset to PyTorch's
        default initialization; otherwise the base class handles it.
        """
        if self.init_cfg is None:
            for m in self.modules():
                if isinstance(m, torch.nn.Conv2d):
                    # In order to be consistent with the source code,
                    # reset the Conv2d initialization parameters
                    m.reset_parameters()
        else:
            super().init_weights()
+ input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='ConvWrapper'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6CSPBep + >>> import torch + >>> model = YOLOv6CSPBep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
def __init__(self,
             arch: str = 'P5',
             plugins: Union[dict, List[dict]] = None,
             deepen_factor: float = 1.0,
             widen_factor: float = 1.0,
             input_channels: int = 3,
             hidden_ratio: float = 0.5,
             out_indices: Tuple[int] = (2, 3, 4),
             frozen_stages: int = -1,
             use_cspsppf: bool = False,
             norm_cfg: ConfigType = dict(
                 type='BN', momentum=0.03, eps=0.001),
             act_cfg: ConfigType = dict(type='SiLU', inplace=True),
             norm_eval: bool = False,
             block_cfg: ConfigType = dict(type='ConvWrapper'),
             init_cfg: OptMultiConfig = None):
    """Store the CSPBep-specific options and delegate to the parent.

    ``hidden_ratio`` is passed to each stage's ``BepC3StageBlock``.
    ``use_cspsppf`` selects ``CSPSPPFBottleneck`` over ``SPPFBottleneck``
    for a stage flagged ``use_spp``. All other arguments are forwarded to
    ``YOLOv6EfficientRep.__init__`` unchanged.
    """
    self.hidden_ratio = hidden_ratio
    self.use_cspsppf = use_cspsppf
    super().__init__(
        arch=arch,
        deepen_factor=deepen_factor,
        widen_factor=widen_factor,
        input_channels=input_channels,
        out_indices=out_indices,
        plugins=plugins,
        frozen_stages=frozen_stages,
        # The parent constructor assigns ``self.use_cspsppf`` from its own
        # argument, which defaults to False. Without forwarding the value
        # a caller's ``use_cspsppf=True`` would be overwritten before the
        # stages are built.
        use_cspsppf=use_cspsppf,
        norm_cfg=norm_cfg,
        act_cfg=act_cfg,
        norm_eval=norm_eval,
        block_cfg=block_cfg,
        init_cfg=init_cfg)
@MODELS.register_module()
class YOLOv7Backbone(BaseBackbone):
    """Backbone used in YOLOv7.

    Args:
        arch (str): Architecture of YOLOv7, one of the keys of
            ``arch_settings``: 'Tiny', 'L', 'X', 'W', 'E', 'D' or 'E2E'.
            Defaults to L.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels (int): Number of input image channels.
            Defaults to 3.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        plugins (list[dict]): List of plugins for stages, each dict contains:

            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
            config norm layer. Defaults to dict(type='BN', momentum=0.03,
            eps=0.001).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to dict(type='SiLU', inplace=True).
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only.
        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
            list[:obj:`ConfigDict`]): Initialization config dict.
    """
    # Block configs shared by the stages of each architecture. They are
    # copied in ``build_stage_layer`` before any per-stage key is added.
    _tiny_stage1_cfg = dict(type='TinyDownSampleBlock', middle_ratio=0.5)
    _tiny_stage2_4_cfg = dict(type='TinyDownSampleBlock', middle_ratio=1.0)
    _l_expand_channel_2x = dict(
        type='ELANBlock',
        middle_ratio=0.5,
        block_ratio=0.5,
        num_blocks=2,
        num_convs_in_block=2)
    _l_no_change_channel = dict(
        type='ELANBlock',
        middle_ratio=0.25,
        block_ratio=0.25,
        num_blocks=2,
        num_convs_in_block=2)
    _x_expand_channel_2x = dict(
        type='ELANBlock',
        middle_ratio=0.4,
        block_ratio=0.4,
        num_blocks=3,
        num_convs_in_block=2)
    _x_no_change_channel = dict(
        type='ELANBlock',
        middle_ratio=0.2,
        block_ratio=0.2,
        num_blocks=3,
        num_convs_in_block=2)
    _w_no_change_channel = dict(
        type='ELANBlock',
        middle_ratio=0.5,
        block_ratio=0.5,
        num_blocks=2,
        num_convs_in_block=2)
    _e_no_change_channel = dict(
        type='ELANBlock',
        middle_ratio=0.4,
        block_ratio=0.4,
        num_blocks=3,
        num_convs_in_block=2)
    _d_no_change_channel = dict(
        type='ELANBlock',
        middle_ratio=1 / 3,
        block_ratio=1 / 3,
        num_blocks=4,
        num_convs_in_block=2)
    _e2e_no_change_channel = dict(
        type='EELANBlock',
        num_elan_block=2,
        middle_ratio=0.4,
        block_ratio=0.4,
        num_blocks=3,
        num_convs_in_block=2)

    # From left to right:
    # in_channels, out_channels, Block_params
    arch_settings = {
        'Tiny': [[64, 64, _tiny_stage1_cfg], [64, 128, _tiny_stage2_4_cfg],
                 [128, 256, _tiny_stage2_4_cfg],
                 [256, 512, _tiny_stage2_4_cfg]],
        'L': [[64, 256, _l_expand_channel_2x],
              [256, 512, _l_expand_channel_2x],
              [512, 1024, _l_expand_channel_2x],
              [1024, 1024, _l_no_change_channel]],
        'X': [[80, 320, _x_expand_channel_2x],
              [320, 640, _x_expand_channel_2x],
              [640, 1280, _x_expand_channel_2x],
              [1280, 1280, _x_no_change_channel]],
        'W':
        [[64, 128, _w_no_change_channel], [128, 256, _w_no_change_channel],
         [256, 512, _w_no_change_channel], [512, 768, _w_no_change_channel],
         [768, 1024, _w_no_change_channel]],
        'E':
        [[80, 160, _e_no_change_channel], [160, 320, _e_no_change_channel],
         [320, 640, _e_no_change_channel], [640, 960, _e_no_change_channel],
         [960, 1280, _e_no_change_channel]],
        'D': [[96, 192,
               _d_no_change_channel], [192, 384, _d_no_change_channel],
              [384, 768, _d_no_change_channel],
              [768, 1152, _d_no_change_channel],
              [1152, 1536, _d_no_change_channel]],
        'E2E': [[80, 160, _e2e_no_change_channel],
                [160, 320, _e2e_no_change_channel],
                [320, 640, _e2e_no_change_channel],
                [640, 960, _e2e_no_change_channel],
                [960, 1280, _e2e_no_change_channel]],
    }

    def __init__(self,
                 arch: str = 'L',
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Tuple[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 plugins: Union[dict, List[dict]] = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None):
        assert arch in self.arch_settings.keys()
        # Stored before the base constructor runs because the build_*
        # hooks below branch on it.
        self.arch = arch
        super().__init__(
            self.arch_settings[arch],
            deepen_factor,
            widen_factor,
            input_channels=input_channels,
            out_indices=out_indices,
            plugins=plugins,
            frozen_stages=frozen_stages,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            norm_eval=norm_eval,
            init_cfg=init_cfg)

    def build_stem_layer(self) -> nn.Module:
        """Build a stem layer.

        'L' and 'X' use three 3x3 convolutions (strides 1, 2, 1), 'Tiny'
        uses two stride-2 3x3 convolutions, and 'W', 'E', 'D' and 'E2E'
        use a ``Focus`` layer.
        """
        # The channel arithmetic is hoisted; the expressions are unchanged.
        stem_channels = int(self.arch_setting[0][0] * self.widen_factor)
        half_channels = int(self.arch_setting[0][0] * self.widen_factor // 2)
        shared = dict(padding=1, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)

        # Every variant takes ``self.input_channels`` instead of a
        # hard-coded 3, so the ``input_channels`` constructor argument is
        # honoured. The default of 3 builds the same stems as before.
        if self.arch in ['L', 'X']:
            stem = nn.Sequential(
                ConvModule(
                    self.input_channels, half_channels, 3, stride=1,
                    **shared),
                ConvModule(
                    half_channels, stem_channels, 3, stride=2, **shared),
                ConvModule(
                    stem_channels, stem_channels, 3, stride=1, **shared))
        elif self.arch == 'Tiny':
            stem = nn.Sequential(
                ConvModule(
                    self.input_channels, half_channels, 3, stride=2,
                    **shared),
                ConvModule(
                    half_channels, stem_channels, 3, stride=2, **shared))
        elif self.arch in ['W', 'E', 'D', 'E2E']:
            stem = Focus(
                self.input_channels,
                stem_channels,
                kernel_size=3,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
        return stem

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer,
                ``[in_channels, out_channels, block_cfg]``.

        Returns:
            list: The stage block, preceded by a downsample layer when the
            architecture defines one for this stage.
        """
        in_channels, out_channels, stage_block_cfg = setting
        in_channels = int(in_channels * self.widen_factor)
        out_channels = int(out_channels * self.widen_factor)

        # Copy so the class-level config dicts shared between stages are
        # not mutated.
        stage_block_cfg = stage_block_cfg.copy()
        stage_block_cfg.setdefault('norm_cfg', self.norm_cfg)
        stage_block_cfg.setdefault('act_cfg', self.act_cfg)

        stage_block_cfg['in_channels'] = in_channels
        stage_block_cfg['out_channels'] = out_channels

        stage = []
        # The block's input width must match what the downsample layer
        # built in ``_build_downsample_layer`` outputs.
        if self.arch in ['W', 'E', 'D', 'E2E']:
            stage_block_cfg['in_channels'] = out_channels
        elif self.arch in ['L', 'X']:
            if stage_idx == 0:
                stage_block_cfg['in_channels'] = out_channels // 2

        downsample_layer = self._build_downsample_layer(
            stage_idx, in_channels, out_channels)
        stage.append(MODELS.build(stage_block_cfg))
        if downsample_layer is not None:
            stage.insert(0, downsample_layer)
        return stage

    def _build_downsample_layer(self, stage_idx: int, in_channels: int,
                                out_channels: int) -> Optional[nn.Module]:
        """Build the downsample layer placed before a stage's block.

        Args:
            stage_idx (int): The index of a stage layer.
            in_channels (int): Width-scaled input channels of the stage.
            out_channels (int): Width-scaled output channels of the stage.

        Returns:
            Optional[nn.Module]: The downsample layer, or None for the
            first stage of the 'Tiny' architecture.
        """
        if self.arch in ['E', 'D', 'E2E']:
            downsample_layer = MaxPoolAndStrideConvBlock(
                in_channels,
                out_channels,
                use_in_channels_of_middle=True,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
        elif self.arch == 'W':
            downsample_layer = ConvModule(
                in_channels,
                out_channels,
                3,
                stride=2,
                padding=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)
        elif self.arch == 'Tiny':
            if stage_idx != 0:
                downsample_layer = nn.MaxPool2d(2, 2)
            else:
                downsample_layer = None
        elif self.arch in ['L', 'X']:
            if stage_idx == 0:
                # The first stage halves the output width here; the stage
                # block is configured with the matching ``in_channels``.
                downsample_layer = ConvModule(
                    in_channels,
                    out_channels // 2,
                    3,
                    stride=2,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg)
            else:
                downsample_layer = MaxPoolAndStrideConvBlock(
                    in_channels,
                    in_channels,
                    use_in_channels_of_middle=False,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg)
        return downsample_layer
import random
from typing import List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from mmdet.models import BatchSyncRandomResize
from mmdet.models.data_preprocessors import DetDataPreprocessor
from mmengine import MessageHub, is_list_of
from mmengine.structures import BaseDataElement
from torch import Tensor

from mmyolo.registry import MODELS

CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str,
                 None]


@MODELS.register_module()
class YOLOXBatchSyncRandomResize(BatchSyncRandomResize):
    """YOLOX batch random resize.

    Args:
        random_size_range (tuple): The multi-scale random range during
            multi-scale training.
        interval (int): The iter interval of change
            image size. Defaults to 10.
        size_divisor (int): Image size divisible factor.
            Defaults to 32.
    """

    def forward(self, inputs: Tensor,
                data_samples: dict) -> Tuple[Tensor, dict]:
        """Resize a batch of images and boxes to ``self._input_size``.

        Args:
            inputs (Tensor): Batched images; the last two dimensions are
                read as ``(h, w)``.
            data_samples (dict): Must contain ``'bboxes_labels'``; may
                contain ``'keypoints'``. Both are rescaled in place.
                NOTE(review): columns ``2::2`` / ``3::2`` of
                ``bboxes_labels`` are treated as x / y coordinates --
                presumably ``(batch_idx, label, x1, y1, x2, y2)``; confirm
                against the collate function.

        Returns:
            Tuple[Tensor, dict]: The (possibly resized) float images and the
            same ``data_samples`` dict.
        """
        h, w = inputs.shape[-2:]
        inputs = inputs.float()
        assert isinstance(data_samples, dict)

        # No size has been drawn yet: keep the incoming resolution.
        if self._input_size is None:
            self._input_size = (h, w)
        scale_y = self._input_size[0] / h
        scale_x = self._input_size[1] / w
        if scale_x != 1 or scale_y != 1:
            inputs = F.interpolate(
                inputs,
                size=self._input_size,
                mode='bilinear',
                align_corners=False)

            data_samples['bboxes_labels'][:, 2::2] *= scale_x
            data_samples['bboxes_labels'][:, 3::2] *= scale_y

            if 'keypoints' in data_samples:
                data_samples['keypoints'][..., 0] *= scale_x
                data_samples['keypoints'][..., 1] *= scale_y

        # The new size is drawn *after* resizing, so it only takes effect
        # on the next call.
        message_hub = MessageHub.get_current_instance()
        if (message_hub.get_info('iter') + 1) % self._interval == 0:
            self._input_size = self._get_random_size(
                aspect_ratio=float(w / h), device=inputs.device)

        return inputs, data_samples


@MODELS.register_module()
class YOLOv5DetDataPreprocessor(DetDataPreprocessor):
    """Rewrite collate_fn to get faster training speed.

    Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate`
    """

    def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs):
        super().__init__(*args, non_blocking=non_blocking, **kwargs)

    def forward(self, data: dict, training: bool = False) -> dict:
        """Perform normalization, padding and bgr2rgb conversion based on
        ``DetDataPreprocessor``.

        At test time this simply defers to the parent implementation. At
        training time the already-batched tensor is channel-swapped,
        normalized and passed through ``batch_augments`` in order.

        Args:
            data (dict): Data sampled from dataloader.
            training (bool): Whether to enable training time augmentation.

        Returns:
            dict: Data in the same format as the model input.
        """
        if not training:
            return super().forward(data, training)

        data = self.cast_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        assert isinstance(data['data_samples'], dict)

        # TODO: Supports multi-scale training
        # Channel swap only applies to 3-channel images.
        if self._channel_conversion and inputs.shape[1] == 3:
            inputs = inputs[:, [2, 1, 0], ...]
        if self._enable_normalize:
            inputs = (inputs - self.mean) / self.std

        if self.batch_augments is not None:
            for batch_aug in self.batch_augments:
                inputs, data_samples = batch_aug(inputs, data_samples)

        # Every image in the batch shares one meta dict object.
        img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs)
        data_samples_output = {
            'bboxes_labels': data_samples['bboxes_labels'],
            'img_metas': img_metas
        }
        if 'masks' in data_samples:
            data_samples_output['masks'] = data_samples['masks']
        if 'keypoints' in data_samples:
            data_samples_output['keypoints'] = data_samples['keypoints']
            data_samples_output['keypoints_visible'] = data_samples[
                'keypoints_visible']

        return {'inputs': inputs, 'data_samples': data_samples_output}


@MODELS.register_module()
class PPYOLOEDetDataPreprocessor(DetDataPreprocessor):
    """Image pre-processor for detection tasks.

    The main difference between PPYOLOEDetDataPreprocessor and
    DetDataPreprocessor is the normalization order. The official
    PPYOLOE resizes the image first and then normalizes it.
    In DetDataPreprocessor, the order is reversed.

    Note: It must be used together with
    `mmyolo.datasets.utils.yolov5_collate`
    """

    def forward(self, data: dict, training: bool = False) -> dict:
        """Perform normalization, padding and bgr2rgb conversion based on
        ``BaseDataPreprocessor``.

        This class applies ``batch_augments`` first and normalizes the
        image afterwards, which is different from ``DetDataPreprocessor``.

        Args:
            data (dict): Data sampled from dataloader. At training time
                ``data['inputs']`` must be a list of tensors.
            training (bool): Whether to enable training time augmentation.

        Returns:
            dict: Data in the same format as the model input.

        Raises:
            RuntimeError: If ``batch_augments`` is ``None`` and the images
                in the list do not share one shape (raised by
                ``torch.stack``).
        """
        if not training:
            return super().forward(data, training)

        assert isinstance(data['inputs'], list) and is_list_of(
            data['inputs'], torch.Tensor), \
            '"inputs" should be a list of Tensor, but got ' \
            f'{type(data["inputs"])}. The possible reason for this ' \
            'is that you are not using it with ' \
            '"mmyolo.datasets.utils.yolov5_collate". Please refer to ' \
            '"configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py".'

        data = self.cast_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        assert isinstance(data['data_samples'], dict)

        # Process data.
        batch_inputs = []
        for _input in inputs:
            # channel transform
            if self._channel_conversion:
                _input = _input[[2, 1, 0], ...]
            # Convert to float after channel conversion to ensure
            # efficiency
            _input = _input.float()
            batch_inputs.append(_input)

        if self.batch_augments is not None:
            # Batch random resize image. Each augment consumes the output of
            # the previous one, so boxes and images stay in sync.
            inputs = batch_inputs
            for batch_aug in self.batch_augments:
                inputs, data_samples = batch_aug(inputs, data_samples)
        else:
            # Without a batch augment nothing has merged the list into a
            # tensor yet; the code below needs a tensor.
            inputs = torch.stack(batch_inputs, dim=0)

        if self._enable_normalize:
            inputs = (inputs - self.mean) / self.std

        img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs)
        data_samples = {
            'bboxes_labels': data_samples['bboxes_labels'],
            'img_metas': img_metas
        }

        return {'inputs': inputs, 'data_samples': data_samples}


# TODO: No generality.
# Its input data format is different from mmdet's batch aug, and it must be
# made compatible in the future.
@MODELS.register_module()
class PPYOLOEBatchRandomResize(BatchSyncRandomResize):
    """PPYOLOE batch random resize.

    Args:
        random_size_range (tuple): The multi-scale random range during
            multi-scale training.
        interval (int): The iter interval of change
            image size. Defaults to 1.
        size_divisor (int): Image size divisible factor.
            Defaults to 32.
        random_interp (bool): Whether to choose interp_mode randomly.
            If set to True, the type of `interp_mode` must be list.
            If set to False, the type of `interp_mode` must be str.
            Defaults to True.
        interp_mode (Union[List, str]): The modes available for resizing
            are ('nearest', 'bilinear', 'bicubic', 'area').
        keep_ratio (bool): Whether to keep the aspect ratio when resizing
            the image. Now we only support keep_ratio=False.
            Defaults to False.
    """

    def __init__(self,
                 random_size_range: Tuple[int, int],
                 interval: int = 1,
                 size_divisor: int = 32,
                 random_interp=True,
                 interp_mode: Union[List[str], str] = [
                     'nearest', 'bilinear', 'bicubic', 'area'
                 ],
                 keep_ratio: bool = False) -> None:
        super().__init__(random_size_range, interval, size_divisor)
        self.random_interp = random_interp
        self.keep_ratio = keep_ratio
        # TODO: need to support keep_ratio==True
        assert not self.keep_ratio, 'We do not yet support keep_ratio=True'

        if self.random_interp:
            assert isinstance(interp_mode, list) and len(interp_mode) > 1,\
                'While random_interp==True, the type of `interp_mode`' \
                ' must be list and len(interp_mode) must large than 1'
            # Copy so that neither the caller's list nor the shared default
            # argument is aliased by this instance.
            self.interp_mode_list = list(interp_mode)
            self.interp_mode = None
        else:
            assert isinstance(interp_mode, str),\
                'While random_interp==False, the type of ' \
                '`interp_mode` must be str'
            assert interp_mode in ['nearest', 'bilinear', 'bicubic', 'area']
            self.interp_mode_list = None
            self.interp_mode = interp_mode

    def forward(self, inputs: list,
                data_samples: dict) -> Tuple[Tensor, dict]:
        """Resize a batch of images and bboxes to shape ``self._input_size``.

        ``inputs`` must be a list of per-image tensors and ``data_samples``
        a dict holding ``'bboxes_labels'``. ``PPYOLOEBatchRandomResize``
        must be used with ``PPYOLOEDetDataPreprocessor`` and
        ``yolov5_collate`` with ``use_ms_training == True``.

        Args:
            inputs (list): Per-image tensors; the last two dimensions of
                each are read as ``(h, w)``.
            data_samples (dict): ``data_samples['bboxes_labels']`` is
                rescaled in place. Column 0 is matched against the image's
                position in ``inputs``; columns 2/4 are scaled by the x
                factor and 3/5 by the y factor.

        Returns:
            Tuple[Tensor, dict]: The images concatenated along a new leading
            batch dimension, and ``data_samples``.
        """
        assert isinstance(inputs, list),\
            'The type of inputs must be list. The possible reason for this ' \
            'is that you are not using it with `PPYOLOEDetDataPreprocessor` ' \
            'and `yolov5_collate` with use_ms_training == True.'

        bboxes_labels = data_samples['bboxes_labels']

        message_hub = MessageHub.get_current_instance()
        is_resample_iter = (
            (message_hub.get_info('iter') + 1) % self._interval == 0)
        # Also draw on the very first call: with ``interval > 1`` no size
        # (and, for random_interp, no mode) exists yet and the resize below
        # would otherwise index ``None``.
        if self._input_size is None or is_resample_iter:
            # get current input size
            self._input_size, interp_mode = self._get_random_size_and_interp()
            if self.random_interp:
                self.interp_mode = interp_mode

        # TODO: need to support type(inputs)==Tensor
        if not isinstance(inputs, list):
            # Only reachable when assertions are stripped (``python -O``).
            raise NotImplementedError('Not implemented yet!')

        outputs = []
        for i, _batch_input in enumerate(inputs):
            h, w = _batch_input.shape[-2:]
            scale_y = self._input_size[0] / h
            scale_x = self._input_size[1] / w
            if scale_x != 1. or scale_y != 1.:
                # ``align_corners`` may only be passed for the interpolating
                # modes; 'nearest' and 'area' require ``None``.
                if self.interp_mode in ('nearest', 'area'):
                    align_corners = None
                else:
                    align_corners = False
                _batch_input = F.interpolate(
                    _batch_input.unsqueeze(0),
                    size=self._input_size,
                    mode=self.interp_mode,
                    align_corners=align_corners)

                # rescale boxes belonging to image ``i``
                indexes = bboxes_labels[:, 0] == i
                bboxes_labels[indexes, 2] *= scale_x
                bboxes_labels[indexes, 3] *= scale_y
                bboxes_labels[indexes, 4] *= scale_x
                bboxes_labels[indexes, 5] *= scale_y

                data_samples['bboxes_labels'] = bboxes_labels
            else:
                _batch_input = _batch_input.unsqueeze(0)

            outputs.append(_batch_input)

        # convert to Tensor
        return torch.cat(outputs, dim=0), data_samples

    def _get_random_size_and_interp(
            self) -> Tuple[Tuple[int, int], Optional[str]]:
        """Randomly generate a shape in ``_random_size_range`` and a
        interp_mode in interp_mode_list.

        Returns:
            Tuple[Tuple[int, int], Optional[str]]: A square size that is a
            multiple of ``_size_divisor``, and the drawn interpolation mode
            (``None`` when ``random_interp`` is False).
        """
        size = random.randint(*self._random_size_range)
        input_size = (self._size_divisor * size, self._size_divisor * size)

        if self.random_interp:
            interp_ind = random.randint(0, len(self.interp_mode_list) - 1)
            interp_mode = self.interp_mode_list[interp_ind]
        else:
            interp_mode = None
        return input_size, interp_mode


# NOTE: the flattened patch carries the next file boundary on the same source
# line as the end of this class; it is kept here so that it is not lost:
#   diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/__init__.py (new file)
# Copyright (c) OpenMMLab. All rights reserved.
+from .ppyoloe_head import PPYOLOEHead, PPYOLOEHeadModule +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule +from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule +from .rtmdet_rotated_head import (RTMDetRotatedHead, + RTMDetRotatedSepBNHeadModule) +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov5_ins_head import YOLOv5InsHead, YOLOv5InsHeadModule +from .yolov6_head import YOLOv6Head, YOLOv6HeadModule +from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule +from .yolov8_head import YOLOv8Head, YOLOv8HeadModule +from .yolox_head import YOLOXHead, YOLOXHeadModule +from .yolox_pose_head import YOLOXPoseHead, YOLOXPoseHeadModule + +__all__ = [ + 'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule', + 'YOLOv6HeadModule', 'YOLOXHeadModule', 'RTMDetHead', + 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', + 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', + 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', + 'RTMDetInsSepBNHeadModule', 'YOLOv5InsHead', 'YOLOv5InsHeadModule', + 'YOLOXPoseHead', 'YOLOXPoseHeadModule' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f4689876785c40cbd7449cab8f378c8f6d1c1b89 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine import MessageHub +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers.yolo_bricks import PPYOLOESELayer +from ..utils import gt_instances_preprocess +from .yolov6_head import YOLOv6Head + + +@MODELS.register_module() +class PPYOLOEHeadModule(BaseModule): + """PPYOLOEHead head module used in `PPYOLOE. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.reg_max = reg_max + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init_with_prob(prior_prob)) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) 
+ + def _init_layers(self): + """initialize conv layers in PPYOLOE head.""" + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.cls_stems = nn.ModuleList() + self.reg_stems = nn.ModuleList() + + for in_channel in self.in_channels: + self.cls_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + self.reg_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + + for in_channel in self.in_channels: + self.cls_preds.append( + nn.Conv2d(in_channel, self.num_classes, 3, padding=1)) + self.reg_preds.append( + nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1)) + + # init proj + proj = torch.arange(self.reg_max + 1, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. 
+ """ + assert len(x) == self.num_levels + + return multi_apply(self.forward_single, x, self.cls_stems, + self.cls_preds, self.reg_stems, self.reg_preds) + + def forward_single(self, x: Tensor, cls_stem: nn.ModuleList, + cls_pred: nn.ModuleList, reg_stem: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tensor: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + avg_feat = F.adaptive_avg_pool2d(x, (1, 1)) + cls_logit = cls_pred(cls_stem(x, avg_feat) + x) + bbox_dist_preds = reg_pred(reg_stem(x, avg_feat)) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + 1, h * w]).permute(0, 3, 1, 2) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class PPYOLOEHead(YOLOv6Head): + """PPYOLOEHead head used in `PPYOLOE `_. + The YOLOv6 head and the PPYOLOE head are only slightly different. + Distribution focal loss is extra used in PPYOLOE, but not in YOLOv6. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of distribution focal + loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + loss_dfl: ConfigType = dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # ppyoloe doesn't need loss_obj + self.loss_obj = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. 
+ batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, reg_max+1, n, 4) -> (bs, n, 4, reg_max+1) + flatten_pred_dists = [ + bbox_pred_org.permute(0, 2, 3, 1).reshape( + num_imgs, -1, (self.head_module.reg_max + 1) * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + 
flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + assigned_scores_sum = assigned_scores.sum() + # reduce_mean between all gpus + assigned_scores_sum = torch.clamp( + reduce_mean(assigned_scores_sum), min=1) + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + + # dfl loss + dist_mask = fg_mask_pre_prior.unsqueeze(-1).repeat( + [1, 1, 
(self.head_module.reg_max + 1) * 4]) + + pred_dist_pos = torch.masked_select( + flatten_dist_preds, + dist_mask).reshape([-1, 4, self.head_module.reg_max + 1]) + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max + 1), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dfl=loss_dfl) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..54245a97f404b66eba47e41f03302110c8894134 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py @@ -0,0 +1,368 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class RTMDetSepBNHeadModule(BaseModule): + """Detection Head of RTMDet. 
+ + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.share_conv = share_conv + self.num_classes = num_classes + self.pred_kernel_size = pred_kernel_size + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + self.in_channels = int(in_channels * widen_factor) + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_classes, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + 
@MODELS.register_module()
class RTMDetHead(YOLOv5Head):
    """RTMDet head.

    Args:
        head_module(ConfigType): Base module used for RTMDetHead
        prior_generator: Points generator feature maps in
            2D points-based detectors.
        bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
            anchor head. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.MlvlPointGenerator',
                     offset=0,
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
                 loss_cls: ConfigType = dict(
                     type='mmdet.QualityFocalLoss',
                     use_sigmoid=True,
                     beta=2.0,
                     loss_weight=1.0),
                 loss_bbox: ConfigType = dict(
                     type='mmdet.GIoULoss', loss_weight=2.0),
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):

        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)

        # With a sigmoid classifier there is one channel per class; otherwise
        # one extra channel is reserved (index ``num_classes``).
        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
        if self.use_sigmoid_cls:
            self.cls_out_channels = self.num_classes
        else:
            self.cls_out_channels = self.num_classes + 1
        # rtmdet doesn't need loss_obj
        self.loss_obj = None

    def special_init(self):
        """Since YOLO series algorithms will inherit from YOLOv5Head, but
        different algorithms have special initialization process.

        The special_init function is designed to deal with this situation.
        Here it builds the assigner and sampler from ``train_cfg`` and resets
        the prior cache used by :meth:`loss_by_feat`.
        """
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
            if self.train_cfg.get('sampler', None) is not None:
                self.sampler = TASK_UTILS.build(
                    self.train_cfg.sampler, default_args=dict(context=self))
            else:
                self.sampler = PseudoSampler(context=self)

            # Cache of the last seen feature-map sizes and their flattened
            # priors; refreshed lazily in ``loss_by_feat``.
            self.featmap_sizes_train = None
            self.flatten_priors_train = None

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: Whatever ``self.head_module`` returns for ``x``
            (multi-level classification scores and bbox predictions).
        """
        return self.head_module(x)

    def loss_by_feat(
            self,
            cls_scores: List[Tensor],
            bbox_preds: List[Tensor],
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Compute losses of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Raw box regression outputs for each
                scale level with shape (N, num_anchors * 4, H, W). Inside
                this method they are multiplied by the last column of the
                priors (generated with ``with_stride=True``) and decoded
                with ``distance2bbox``.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc. Only its length is used
                here.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
                Batch of gt_instances_ignore. Not used by this
                implementation. Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary with ``loss_cls`` and
            ``loss_bbox``.
        """
        num_imgs = len(batch_img_metas)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels

        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        # Rows whose coordinates sum to a positive value are flagged as
        # real boxes (presumably the others are zero padding - confirm
        # against gt_instances_preprocess).
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        device = cls_scores[0].device

        # If the shape does not equal, generate new one
        if featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = featmap_sizes
            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                featmap_sizes, device=device, with_stride=True)
            self.flatten_priors_train = torch.cat(
                mlvl_priors_with_stride, dim=0)

        flatten_cls_scores = torch.cat([
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.cls_out_channels)
            for cls_score in cls_scores
        ], 1).contiguous()

        flatten_bboxes = torch.cat([
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ], 1)
        # Scale the predicted distances by the per-prior stride column, then
        # decode them around the prior centres.
        flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1,
                                                                    None]
        flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2],
                                       flatten_bboxes)

        # Assignment must not back-propagate, hence the detached inputs.
        assigned_result = self.assigner(flatten_bboxes.detach(),
                                        flatten_cls_scores.detach(),
                                        self.flatten_priors_train, gt_labels,
                                        gt_bboxes, pad_bbox_flag)

        labels = assigned_result['assigned_labels'].reshape(-1)
        label_weights = assigned_result['assigned_labels_weights'].reshape(-1)
        bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4)
        assign_metrics = assigned_result['assign_metrics'].reshape(-1)
        # Use the same channel count the scores were flattened with above;
        # ``num_classes`` is only equal to it when a sigmoid classifier is
        # used.
        cls_preds = flatten_cls_scores.reshape(-1, self.cls_out_channels)
        bbox_preds = flatten_bboxes.reshape(-1, 4)

        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        bg_class_ind = self.num_classes
        pos_inds = ((labels >= 0)
                    & (labels < bg_class_ind)).nonzero().squeeze(1)
        avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item()

        loss_cls = self.loss_cls(
            cls_preds, (labels, assign_metrics),
            label_weights,
            avg_factor=avg_factor)

        if len(pos_inds) > 0:
            loss_bbox = self.loss_bbox(
                bbox_preds[pos_inds],
                bbox_targets[pos_inds],
                weight=assign_metrics[pos_inds],
                avg_factor=avg_factor)
        else:
            # Keep the graph connected so the loss is a zero tensor that
            # still depends on the predictions.
            loss_bbox = bbox_preds.sum() * 0

        return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
@MODELS.register_module()
class RTMDetInsSepBNHeadModule(RTMDetSepBNHeadModule):
    """Detection and Instance Segmentation Head of RTMDet.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        num_prototypes (int): Number of mask prototype features extracted
            from the mask head. Defaults to 8.
        dyconv_channels (int): Channel of the dynamic conv layers.
            Defaults to 8.
        num_dyconvs (int): Number of the dynamic convolution layers.
            Defaults to 3.
        use_sigmoid_cls (bool): Use sigmoid for class prediction.
            Defaults to True.
    """

    def __init__(self,
                 num_classes: int,
                 *args,
                 num_prototypes: int = 8,
                 dyconv_channels: int = 8,
                 num_dyconvs: int = 3,
                 use_sigmoid_cls: bool = True,
                 **kwargs):
        # These attributes are read by ``_init_layers`` and are therefore set
        # before the parent constructor runs (presumably the parent
        # constructor is what triggers layer construction - confirm).
        self.num_prototypes = num_prototypes
        self.num_dyconvs = num_dyconvs
        self.dyconv_channels = dyconv_channels
        self.use_sigmoid_cls = use_sigmoid_cls
        if self.use_sigmoid_cls:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        super().__init__(num_classes=num_classes, *args, **kwargs)

    def _init_layers(self):
        """Initialize layers of the head.

        Builds, for every feature level, three conv towers (classification,
        regression, kernel) with their prediction convs, and one shared
        ``MaskFeatModule`` producing the mask prototypes.
        """
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.kernel_convs = nn.ModuleList()

        self.rtm_cls = nn.ModuleList()
        self.rtm_reg = nn.ModuleList()
        self.rtm_kernel = nn.ModuleList()
        self.rtm_obj = nn.ModuleList()

        # calculate num dynamic parameters
        weight_nums, bias_nums = [], []
        for i in range(self.num_dyconvs):
            if i == 0:
                # First dynamic conv: prototypes plus two extra input
                # channels.
                weight_nums.append(
                    (self.num_prototypes + 2) * self.dyconv_channels)
                bias_nums.append(self.dyconv_channels)
            elif i == self.num_dyconvs - 1:
                # Last dynamic conv collapses to a single output channel.
                weight_nums.append(self.dyconv_channels)
                bias_nums.append(1)
            else:
                weight_nums.append(self.dyconv_channels * self.dyconv_channels)
                bias_nums.append(self.dyconv_channels)
        self.weight_nums = weight_nums
        self.bias_nums = bias_nums
        self.num_gen_params = sum(weight_nums) + sum(bias_nums)
        pred_pad_size = self.pred_kernel_size // 2

        def build_tower_conv(in_chn):
            # One 3x3 conv block of a tower; all three towers use the same
            # configuration.
            return ConvModule(
                in_chn,
                self.feat_channels,
                3,
                stride=1,
                padding=1,
                conv_cfg=self.conv_cfg,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg)

        for n in range(len(self.featmap_strides)):
            cls_convs = nn.ModuleList()
            reg_convs = nn.ModuleList()
            kernel_convs = nn.ModuleList()
            for i in range(self.stacked_convs):
                chn = self.in_channels if i == 0 else self.feat_channels
                # Creation order (cls, reg, kernel) is kept as in the
                # original code.
                cls_convs.append(build_tower_conv(chn))
                reg_convs.append(build_tower_conv(chn))
                kernel_convs.append(build_tower_conv(chn))
            self.cls_convs.append(cls_convs)
            # Register the regression tower itself. Appending ``cls_convs``
            # here made ``reg_convs`` an alias of the classification tower,
            # so both branches shared one set of weights and the layers built
            # above were discarded.
            self.reg_convs.append(reg_convs)
            self.kernel_convs.append(kernel_convs)

            self.rtm_cls.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * self.cls_out_channels,
                    self.pred_kernel_size,
                    padding=pred_pad_size))
            self.rtm_reg.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * 4,
                    self.pred_kernel_size,
                    padding=pred_pad_size))
            self.rtm_kernel.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_gen_params,
                    self.pred_kernel_size,
                    padding=pred_pad_size))

        if self.share_conv:
            # Share the conv weights of level 0 with every other level; the
            # remaining sub-modules of each ConvModule stay per level.
            for n in range(len(self.featmap_strides)):
                for i in range(self.stacked_convs):
                    self.cls_convs[n][i].conv = self.cls_convs[0][i].conv
                    self.reg_convs[n][i].conv = self.reg_convs[0][i].conv

        self.mask_head = MaskFeatModule(
            in_channels=self.in_channels,
            feat_channels=self.feat_channels,
            stacked_convs=4,
            num_levels=len(self.featmap_strides),
            num_prototypes=self.num_prototypes,
            act_cfg=self.act_cfg,
            norm_cfg=self.norm_cfg)

    def init_weights(self) -> None:
        """Initialize weights of the head."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                normal_init(m, mean=0, std=0.01)
            if is_norm(m):
                constant_init(m, 1)
        bias_cls = bias_init_with_prob(0.01)
        # ``rtm_kernel`` only takes part in the zip; its convs keep the
        # generic Conv2d initialisation applied above.
        for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg,
                                                self.rtm_kernel):
            normal_init(rtm_cls, std=0.01, bias=bias_cls)
            normal_init(rtm_reg, std=0.01, bias=1)

    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple: Usually a tuple of classification scores and bbox prediction
            - cls_scores (list[Tensor]): Classification scores for all scale
              levels, each is a 4D-tensor, the channels number is
              num_base_priors * num_classes.
            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
              levels, each is a 4D-tensor, the channels number is
              num_base_priors * 4.
            - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale
              levels, each is a 4D-tensor, the channels number is
              num_gen_params.
            - mask_feat (Tensor): Mask prototype features.
                Has shape (batch_size, num_prototypes, H, W).
        """
        mask_feat = self.mask_head(feats)

        cls_scores = []
        bbox_preds = []
        kernel_preds = []
        # The stride itself is unused; zipping with ``featmap_strides`` only
        # bounds the number of processed levels, as in the original code.
        for idx, (x, _stride) in enumerate(zip(feats, self.featmap_strides)):
            cls_feat = x
            reg_feat = x
            kernel_feat = x

            for cls_layer in self.cls_convs[idx]:
                cls_feat = cls_layer(cls_feat)
            cls_score = self.rtm_cls[idx](cls_feat)

            for kernel_layer in self.kernel_convs[idx]:
                kernel_feat = kernel_layer(kernel_feat)
            kernel_pred = self.rtm_kernel[idx](kernel_feat)

            for reg_layer in self.reg_convs[idx]:
                reg_feat = reg_layer(reg_feat)
            reg_dist = self.rtm_reg[idx](reg_feat)

            cls_scores.append(cls_score)
            bbox_preds.append(reg_dist)
            kernel_preds.append(kernel_pred)
        return tuple(cls_scores), tuple(bbox_preds), tuple(
            kernel_preds), mask_feat
def predict_by_feat(self,
                    cls_scores: List[Tensor],
                    bbox_preds: List[Tensor],
                    kernel_preds: List[Tensor],
                    mask_feats: Tensor,
                    score_factors: Optional[List[Tensor]] = None,
                    batch_img_metas: Optional[List[dict]] = None,
                    cfg: Optional[ConfigDict] = None,
                    rescale: bool = True,
                    with_nms: bool = True) -> List[InstanceData]:
    """Transform a batch of output features extracted from the head into
    bbox and mask results.

    Args:
        cls_scores (list[Tensor]): Classification scores for all
            scale levels, each is a 4D-tensor, has shape
            (batch_size, num_priors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for all
            scale levels, each is a 4D-tensor, has shape
            (batch_size, num_priors * 4, H, W).
        kernel_preds (list[Tensor]): Kernel predictions of dynamic
            convs for all scale levels, each is a 4D-tensor, has shape
            (batch_size, num_params, H, W).
        mask_feats (Tensor): Mask prototype features extracted from the
            mask head, has shape (batch_size, num_prototypes, H, W).
        score_factors (list[Tensor], optional): Accepted for interface
            compatibility; this implementation never reads it.
            Defaults to None.
        batch_img_metas (list[dict], Optional): Batch image meta info.
            Although it defaults to None, ``len()`` is taken of it and
            each entry is indexed with ``'ori_shape'`` and
            ``'scale_factor'``, so it must be provided.
        cfg (ConfigDict, optional): Test / postprocessing
            configuration, if None, test_cfg would be used. A deep copy
            is modified internally, the passed object is left untouched.
            Defaults to None.
        rescale (bool): If True, return boxes in original image space.
            Defaults to True.
        with_nms (bool): If True, do nms before return boxes.
            Defaults to True.

    Returns:
        list[:obj:`InstanceData`]: Object detection and instance
        segmentation results of each image after the post process.
        Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
            - masks (Tensor): Has a shape (num_instances, h, w).
    """
    cfg = self.test_cfg if cfg is None else cfg
    # Work on a copy because ``multi_label`` / ``max_per_img`` are
    # overwritten below.
    cfg = copy.deepcopy(cfg)

    # Multi-label prediction only makes sense with more than one class.
    multi_label = cfg.multi_label
    multi_label &= self.num_classes > 1
    cfg.multi_label = multi_label

    num_imgs = len(batch_img_metas)
    featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]

    # If the shape does not change, use the previous mlvl_priors
    if featmap_sizes != self.featmap_sizes:
        self.mlvl_priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=cls_scores[0].dtype,
            device=cls_scores[0].device,
            with_stride=True)
        self.featmap_sizes = featmap_sizes
    flatten_priors = torch.cat(self.mlvl_priors)

    # One stride value per prior, level by level.
    mlvl_strides = [
        flatten_priors.new_full(
            (featmap_size.numel() * self.num_base_priors, ), stride) for
        featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
    ]
    flatten_stride = torch.cat(mlvl_strides)

    # flatten cls_scores, bbox_preds
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                              self.num_classes)
        for cls_score in cls_scores
    ]
    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_kernel_preds = [
        kernel_pred.permute(0, 2, 3,
                            1).reshape(num_imgs, -1,
                                       self.head_module.num_gen_params)
        for kernel_pred in kernel_preds
    ]

    flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
    flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
    flatten_decoded_bboxes = self.bbox_coder.decode(
        flatten_priors[..., :2].unsqueeze(0), flatten_bbox_preds,
        flatten_stride)

    flatten_kernel_preds = torch.cat(flatten_kernel_preds, dim=1)

    results_list = []
    # Post-process image by image.
    for (bboxes, scores, kernel_pred, mask_feat,
         img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
                          flatten_kernel_preds, mask_feats,
                          batch_img_metas):
        ori_shape = img_meta['ori_shape']
        scale_factor = img_meta['scale_factor']
        if 'pad_param' in img_meta:
            pad_param = img_meta['pad_param']
        else:
            pad_param = None

        score_thr = cfg.get('score_thr', -1)
        if scores.shape[0] == 0:
            # No priors at all: emit an empty, correctly typed result.
            empty_results = InstanceData()
            empty_results.bboxes = bboxes
            empty_results.scores = scores[:, 0]
            empty_results.labels = scores[:, 0].int()
            h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2]
            empty_results.masks = torch.zeros(
                size=(0, h, w), dtype=torch.bool, device=bboxes.device)
            results_list.append(empty_results)
            continue

        nms_pre = cfg.get('nms_pre', 100000)
        if cfg.multi_label is False:
            # Single-label: keep only the best class of every prior.
            scores, labels = scores.max(1, keepdim=True)
            scores, _, keep_idxs, results = filter_scores_and_topk(
                scores,
                score_thr,
                nms_pre,
                results=dict(
                    labels=labels[:, 0],
                    kernel_pred=kernel_pred,
                    priors=flatten_priors))
            labels = results['labels']
            kernel_pred = results['kernel_pred']
            priors = results['priors']
        else:
            out = filter_scores_and_topk(
                scores,
                score_thr,
                nms_pre,
                results=dict(
                    kernel_pred=kernel_pred, priors=flatten_priors))
            scores, labels, keep_idxs, filtered_results = out
            kernel_pred = filtered_results['kernel_pred']
            priors = filtered_results['priors']

        results = InstanceData(
            scores=scores,
            labels=labels,
            bboxes=bboxes[keep_idxs],
            kernels=kernel_pred,
            priors=priors)

        if rescale:
            if pad_param is not None:
                # pad_param[2] is removed from the x coordinates and
                # pad_param[0] from the y coordinates (presumably the
                # left / top padding - confirm against the pipeline).
                results.bboxes -= results.bboxes.new_tensor([
                    pad_param[2], pad_param[0], pad_param[2], pad_param[0]
                ])
            results.bboxes /= results.bboxes.new_tensor(
                scale_factor).repeat((1, 2))

        if cfg.get('yolox_style', False):
            # do not need max_per_img
            cfg.max_per_img = len(results)

        # Boxes were already rescaled above, so only the masks are rescaled
        # inside the post-processing step.
        results = self._bbox_mask_post_process(
            results=results,
            mask_feat=mask_feat,
            cfg=cfg,
            rescale_bbox=False,
            rescale_mask=rescale,
            with_nms=with_nms,
            pad_param=pad_param,
            img_meta=img_meta)
        # Clip x to [0, ori_shape[1]] and y to [0, ori_shape[0]].
        results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
        results.bboxes[:, 1::2].clamp_(0, ori_shape[0])

        results_list.append(results)
    return results_list
+ cfg: ConfigDict, + rescale_bbox: bool = False, + rescale_mask: bool = True, + with_nms: bool = True, + pad_param: Optional[np.ndarray] = None, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox and mask post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + mask_feat (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale_bbox (bool): If True, return boxes in original image space. + Default to False. + rescale_mask (bool): If True, return masks in original image space. + Default to True. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + if rescale_bbox: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if hasattr(results, 'score_factors'): + # TODO: Add sqrt operation in order to be consistent with + # the paper. 
+ score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat(mask_feat, + results.kernels, + results.priors) + + stride = self.prior_generator.strides[0][0] + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale_mask: + # TODO: When use mmdet.Resize or mmdet.Pad, will meet bug + # Use img_meta to crop and resize + ori_h, ori_w = img_meta['ori_shape'][:2] + if isinstance(pad_param, np.ndarray): + pad_param = pad_param.astype(np.int32) + crop_y1, crop_y2 = pad_param[ + 0], mask_logits.shape[-2] - pad_param[1] + crop_x1, crop_x2 = pad_param[ + 2], mask_logits.shape[-1] - pad_param[3] + mask_logits = mask_logits[..., crop_y1:crop_y2, + crop_x1:crop_x2] + mask_logits = F.interpolate( + mask_logits, + size=[ori_h, ori_w], + mode='bilinear', + align_corners=False) + + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale_mask else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + return results + + def _mask_predict_by_feat(self, mask_feat: Tensor, 
kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. + Has shape (num_instance, num_params) + priors (Tensor): Center priors for each instance. + Has shape (num_instance, 4). + Returns: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). + """ + num_inst = kernels.shape[0] + h, w = mask_feat.size()[-2:] + if num_inst < 1: + return torch.empty( + size=(num_inst, h, w), + dtype=mask_feat.dtype, + device=mask_feat.device) + if len(mask_feat.shape) < 4: + mask_feat.unsqueeze(0) + + coord = self.prior_generator.single_level_grid_priors( + (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2) + num_inst = priors.shape[0] + points = priors[:, :2].reshape(-1, 1, 2) + strides = priors[:, 2:].reshape(-1, 1, 2) + relative_coord = (points - coord).permute(0, 2, 1) / ( + strides[..., 0].reshape(-1, 1, 1) * 8) + relative_coord = relative_coord.reshape(num_inst, 2, h, w) + + mask_feat = torch.cat( + [relative_coord, + mask_feat.repeat(num_inst, 1, 1, 1)], dim=1) + weights, biases = self.parse_dynamic_params(kernels) + + n_layers = len(weights) + x = mask_feat.reshape(1, -1, h, w) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, weight, bias=bias, stride=1, padding=0, groups=num_inst) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, h, w) + return x + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.head_module.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, + self.head_module.weight_nums + self.head_module.bias_nums, + dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + 
def loss_by_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        batch_gt_instances_ignore: OptInstanceList = None) -> dict:
    """Loss computation placeholder for the instance segmentation head.

    No loss is implemented for this head: every call raises, regardless of
    the arguments passed.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError
@MODELS.register_module()
class RTMDetRotatedSepBNHeadModule(RTMDetSepBNHeadModule):
    """Detection Head Module of RTMDet-R.

    Extends the RTMDet detection head module with one extra prediction conv
    per feature level that outputs the angle. The width of that output is
    controlled by ``angle_out_dim``, which is produced by the angle_coder
    module and passed in by the head.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid.  Defaults to 1.
        feat_channels (int): Number of hidden channels. Used in child classes.
            Defaults to 256
        stacked_convs (int): Number of stacking convs of the head.
            Defaults to 2.
        featmap_strides (Sequence[int]): Downsample factor of each feature map.
            Defaults to (8, 16, 32).
        share_conv (bool): Whether to share conv layers between stages.
            Defaults to True.
        pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1.
        angle_out_dim (int): Encoded length of angle, will passed by head.
            Defaults to 1.
        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
            convolution layer. Defaults to None.
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to ``dict(type='BN')``.
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Default: dict(type='SiLU', inplace=True).
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(
        self,
        num_classes: int,
        in_channels: int,
        widen_factor: float = 1.0,
        num_base_priors: int = 1,
        feat_channels: int = 256,
        stacked_convs: int = 2,
        featmap_strides: Sequence[int] = [8, 16, 32],
        share_conv: bool = True,
        pred_kernel_size: int = 1,
        angle_out_dim: int = 1,
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN'),
        act_cfg: ConfigType = dict(type='SiLU', inplace=True),
        init_cfg: OptMultiConfig = None,
    ):
        # ``_init_layers`` reads this attribute, so it is stored before the
        # parent constructor is invoked.
        self.angle_out_dim = angle_out_dim
        parent_kwargs = dict(
            num_classes=num_classes,
            in_channels=in_channels,
            widen_factor=widen_factor,
            num_base_priors=num_base_priors,
            feat_channels=feat_channels,
            stacked_convs=stacked_convs,
            featmap_strides=featmap_strides,
            share_conv=share_conv,
            pred_kernel_size=pred_kernel_size,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            init_cfg=init_cfg)
        super().__init__(**parent_kwargs)

    def _init_layers(self):
        """Build the parent layers plus one angle conv per feature level."""
        super()._init_layers()
        angle_channels = self.num_base_priors * self.angle_out_dim
        angle_padding = self.pred_kernel_size // 2
        self.rtm_ang = nn.ModuleList([
            nn.Conv2d(
                self.feat_channels,
                angle_channels,
                self.pred_kernel_size,
                padding=angle_padding) for _ in self.featmap_strides
        ])

    def init_weights(self) -> None:
        """Initialize weights of the head, including the angle convs."""
        # Parent initialisation first, then the angle branch on top of it.
        super().init_weights()
        for angle_conv in self.rtm_ang:
            normal_init(angle_conv, std=0.01)

    def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
        """Run the classification, regression and angle branches per level.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.

        Returns:
            tuple: Three tuples, one entry per feature level:
            - cls_scores (tuple[Tensor]): Classification scores, channels
              number is num_base_priors * num_classes.
            - bbox_preds (tuple[Tensor]): Box energies / deltas, channels
              number is num_base_priors * 4.
            - angle_preds (tuple[Tensor]): Angle predictions, channels
              number is num_base_priors * angle_out_dim.
        """

        def run_tower(layers, feat):
            # Apply the conv stack of one branch sequentially.
            for layer in layers:
                feat = layer(feat)
            return feat

        all_cls, all_box, all_angle = [], [], []
        for level, feat in enumerate(feats):
            cls_feat = run_tower(self.cls_convs[level], feat)
            all_cls.append(self.rtm_cls[level](cls_feat))

            # Box distances and angle are both predicted from the
            # regression features.
            reg_feat = run_tower(self.reg_convs[level], feat)
            all_box.append(self.rtm_reg[level](reg_feat))
            all_angle.append(self.rtm_ang[level](reg_feat))
        return tuple(all_cls), tuple(all_box), tuple(all_angle)
code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + └────►decode──(a)─┘ + + 2. `use_hbbox_loss=False` and loss_angle is specified. + An angle loss is added on angle_pred. + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + ├────►decode──(a)─┘ + │ + └───────────────────────────────────────────►loss_angle + + 3. `use_hbbox_loss=True` and loss_angle is specified. + In this case the loss_angle must be set. + + .. code:: text + + bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox + + angle_pred──────────────────────────────────────►loss_angle + + - There's a `decode_with_angle` flag in test_cfg, which is similar + to training process. + + When `decode_with_angle=True`: + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──(xywha)──►rbox_pred + │ ▲ + └────►decode──(a)─┘ + + When `decode_with_angle=False`: + + .. code:: text + + bbox_pred──(tblr)─►decode + │ (xyxy) + ▼ + format───(xywh)──►concat──(xywha)──►rbox_pred + ▲ + angle_pred────────►decode────(a)───────┘ + + Args: + head_module(ConfigType): Base module used for RTMDetRotatedHead. + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + angle_version (str): Angle representations. Defaults to 'le90'. + use_hbbox_loss (bool): If true, use horizontal bbox loss and + loss_angle should not be None. Default to False. + angle_coder (:obj:`ConfigDict` or dict): Config of angle coder. + loss_angle (:obj:`ConfigDict` or dict, optional): Config of angle loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. 
    def __init__(
            self,
            head_module: ConfigType,
            prior_generator: ConfigType = dict(
                type='mmdet.MlvlPointGenerator', strides=[8, 16, 32],
                offset=0),
            bbox_coder: ConfigType = dict(type='DistanceAnglePointCoder'),
            loss_cls: ConfigType = dict(
                type='mmdet.QualityFocalLoss',
                use_sigmoid=True,
                beta=2.0,
                loss_weight=1.0),
            loss_bbox: ConfigType = dict(
                type='mmrotate.RotatedIoULoss', mode='linear',
                loss_weight=2.0),
            angle_version: str = 'le90',
            use_hbbox_loss: bool = False,
            angle_coder: ConfigType = dict(type='mmrotate.PseudoAngleCoder'),
            loss_angle: OptConfigType = None,
            train_cfg: OptConfigType = None,
            test_cfg: OptConfigType = None,
            init_cfg: OptMultiConfig = None):
        """Build the rotated head.

        The angle coder is built first so that its ``encode_size`` can be
        written into ``head_module`` before the parent class builds the head
        module from that config.

        Raises:
            ImportError: If mmrotate could not be imported.
            AssertionError: If ``use_hbbox_loss`` is True but ``loss_angle``
                is None.
        """
        # MMROTATE_AVAILABLE is set by the guarded import at module level.
        if not MMROTATE_AVAILABLE:
            raise ImportError(
                'Please run "mim install -r requirements/mmrotate.txt" '
                'to install mmrotate first for rotated detection.')

        # These plain attributes are assigned before super().__init__().
        self.angle_version = angle_version
        self.use_hbbox_loss = use_hbbox_loss
        if self.use_hbbox_loss:
            # A horizontal box loss carries no angle supervision, so a
            # dedicated angle loss is mandatory in that mode.
            assert loss_angle is not None, \
                ('When use hbbox loss, loss_angle needs to be specified')
        self.angle_coder = TASK_UTILS.build(angle_coder)
        self.angle_out_dim = self.angle_coder.encode_size
        if head_module.get('angle_out_dim') is not None:
            warnings.warn('angle_out_dim will be overridden by angle_coder '
                          'and does not need to be set manually')

        # NOTE: this writes into the caller-supplied `head_module` config
        # in place; any user-provided value is replaced by the coder's size.
        head_module['angle_out_dim'] = self.angle_out_dim
        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)

        # Built after super().__init__(); `loss_by_feat` checks it for None.
        if loss_angle is not None:
            self.loss_angle = MODELS.build(loss_angle)
        else:
            self.loss_angle = None
    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        angle_preds: List[Tensor],
                        objectnesses: Optional[List[Tensor]] = None,
                        batch_img_metas: Optional[List[dict]] = None,
                        cfg: Optional[ConfigDict] = None,
                        rescale: bool = True,
                        with_nms: bool = True) -> List[InstanceData]:
        """Transform a batch of output features extracted by the head into
        rotated bbox results.

        Args:
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            angle_preds (list[Tensor]): Box angle for each scale level
                with shape (N, num_points * angle_dim, H, W)
            objectnesses (list[Tensor], Optional): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).
            batch_img_metas (list[dict], Optional): Batch image meta info.
                Although the default is None, the body calls ``len()`` on
                it and reads ``scale_factor`` (and optionally
                ``pad_param``) from every entry, so it must be provided.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None. A deep copy is modified, never the
                object passed in.
            rescale (bool): If True, return boxes in original image space.
                Defaults to True.
            with_nms (bool): If True, do nms before return boxes.
                Defaults to True.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (RotatedBoxes): Has a shape (num_instances, 5),
              the last dimension 5 arranged as (x, y, w, h, angle).
        """
        # NOTE: only cls/bbox (and objectness) level counts are checked;
        # the number of angle_preds levels is not validated here.
        assert len(cls_scores) == len(bbox_preds)
        if objectnesses is None:
            with_objectnesses = False
        else:
            with_objectnesses = True
            assert len(cls_scores) == len(objectnesses)

        cfg = self.test_cfg if cfg is None else cfg
        # Work on a private copy: multi_label / max_per_img are rewritten
        # below and must not leak into self.test_cfg.
        cfg = copy.deepcopy(cfg)

        # Multi-label only makes sense with more than one class.
        multi_label = cfg.multi_label
        multi_label &= self.num_classes > 1
        cfg.multi_label = multi_label

        # Whether to decode rbox with angle.
        # different setting lead to different final results.
        # Defaults to True.
        decode_with_angle = cfg.get('decode_with_angle', True)

        num_imgs = len(batch_img_metas)
        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]

        # If the shape does not change, use the previous mlvl_priors
        if featmap_sizes != self.featmap_sizes:
            self.mlvl_priors = self.prior_generator.grid_priors(
                featmap_sizes,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device)
            self.featmap_sizes = featmap_sizes
        flatten_priors = torch.cat(self.mlvl_priors)

        # One stride value per prior, level by level, in the same order as
        # the concatenated priors.
        mlvl_strides = [
            flatten_priors.new_full(
                (featmap_size.numel() * self.num_base_priors, ), stride) for
            featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
        ]
        flatten_stride = torch.cat(mlvl_strides)

        # flatten cls_scores, bbox_preds and objectness
        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.num_classes)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]
        flatten_angle_preds = [
            angle_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                   self.angle_out_dim)
            for angle_pred in angle_preds
        ]

        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
        flatten_angle_preds = torch.cat(flatten_angle_preds, dim=1)
        flatten_angle_preds = self.angle_coder.decode(
            flatten_angle_preds, keepdim=True)

        if decode_with_angle:
            # Decode distances and angle together in one bbox_coder call.
            flatten_rbbox_preds = torch.cat(
                [flatten_bbox_preds, flatten_angle_preds], dim=-1)
            flatten_decoded_bboxes = self.bbox_coder.decode(
                flatten_priors[None], flatten_rbbox_preds, flatten_stride)
        else:
            # Decode without the angle, convert xyxy -> cxcywh, then append
            # the decoded angle as the fifth column.
            flatten_decoded_hbboxes = self.bbox_coder.decode(
                flatten_priors[None], flatten_bbox_preds, flatten_stride)
            flatten_decoded_hbboxes = HorizontalBoxes.xyxy_to_cxcywh(
                flatten_decoded_hbboxes)
            flatten_decoded_bboxes = torch.cat(
                [flatten_decoded_hbboxes, flatten_angle_preds], dim=-1)

        if with_objectnesses:
            flatten_objectness = [
                objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
                for objectness in objectnesses
            ]
            flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
        else:
            # Placeholder so the zip below still yields one item per image.
            flatten_objectness = [None for _ in range(num_imgs)]

        results_list = []
        for (bboxes, scores, objectness,
             img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
                              flatten_objectness, batch_img_metas):
            scale_factor = img_meta['scale_factor']
            if 'pad_param' in img_meta:
                pad_param = img_meta['pad_param']
            else:
                pad_param = None

            score_thr = cfg.get('score_thr', -1)
            # yolox_style does not require the following operations
            if objectness is not None and score_thr > 0 and not cfg.get(
                    'yolox_style', False):
                conf_inds = objectness > score_thr
                bboxes = bboxes[conf_inds, :]
                scores = scores[conf_inds, :]
                objectness = objectness[conf_inds]

            if objectness is not None:
                # conf = obj_conf * cls_conf
                scores *= objectness[:, None]

            # Nothing left for this image: emit an empty result and move on.
            if scores.shape[0] == 0:
                empty_results = InstanceData()
                empty_results.bboxes = RotatedBoxes(bboxes)
                empty_results.scores = scores[:, 0]
                empty_results.labels = scores[:, 0].int()
                results_list.append(empty_results)
                continue

            nms_pre = cfg.get('nms_pre', 100000)
            if cfg.multi_label is False:
                # Single-label: keep only the best class per box.
                scores, labels = scores.max(1, keepdim=True)
                scores, _, keep_idxs, results = filter_scores_and_topk(
                    scores,
                    score_thr,
                    nms_pre,
                    results=dict(labels=labels[:, 0]))
                labels = results['labels']
            else:
                scores, labels, keep_idxs, _ = filter_scores_and_topk(
                    scores, score_thr, nms_pre)

            results = InstanceData(
                scores=scores,
                labels=labels,
                bboxes=RotatedBoxes(bboxes[keep_idxs]))

            if rescale:
                if pad_param is not None:
                    # Shift by pad_param[2] in x and pad_param[0] in y
                    # (presumably the left and top padding - confirm).
                    results.bboxes.translate_([-pad_param[2], -pad_param[0]])

                scale_factor = [1 / s for s in img_meta['scale_factor']]
                results.bboxes = scale_boxes(results.bboxes, scale_factor)

            if cfg.get('yolox_style', False):
                # do not need max_per_img
                cfg.max_per_img = len(results)

            # rescale=False: boxes were already rescaled above.
            results = self._bbox_post_process(
                results=results,
                cfg=cfg,
                rescale=False,
                with_nms=with_nms,
                img_meta=img_meta)

            results_list.append(results)
        return results_list
    def loss_by_feat(
            self,
            cls_scores: List[Tensor],
            bbox_preds: List[Tensor],
            angle_preds: List[Tensor],
            batch_gt_instances: InstanceList,
            batch_img_metas: List[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Compute losses of the head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Distance predictions for each scale
                level with shape (N, num_anchors * 4, H, W). They are
                multiplied by the last column of the cached priors
                (generated with ``with_stride=True``) and then decoded
                with ``distance2obb`` / ``distance2bbox``.
            angle_preds (list[Tensor]): Angle prediction for each scale
                level with shape (N, num_anchors * angle_out_dim, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc. Only its length is used
                here.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
                Batch of gt_instances_ignore. Not referenced by this
                implementation. Defaults to None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components with keys
            ``loss_cls`` and ``loss_bbox``, plus ``loss_angle`` when an
            angle loss is configured.
        """
        num_imgs = len(batch_img_metas)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels

        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xywha
        # 1.0 for rows whose box values sum to > 0, 0.0 otherwise
        # (presumably zero rows are padding - confirm in the preprocess).
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        device = cls_scores[0].device

        # If the shape does not equal, generate new one
        if featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = featmap_sizes
            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                featmap_sizes, device=device, with_stride=True)
            self.flatten_priors_train = torch.cat(
                mlvl_priors_with_stride, dim=0)

        flatten_cls_scores = torch.cat([
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.cls_out_channels)
            for cls_score in cls_scores
        ], 1).contiguous()

        flatten_tblrs = torch.cat([
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ], 1)
        # Scale the raw distances by the last prior column.
        flatten_tblrs = flatten_tblrs * self.flatten_priors_train[..., -1,
                                                                  None]
        flatten_angles = torch.cat([
            angle_pred.permute(0, 2, 3, 1).reshape(
                num_imgs, -1, self.angle_out_dim) for angle_pred in angle_preds
        ], 1)
        flatten_decoded_angle = self.angle_coder.decode(
            flatten_angles, keepdim=True)
        flatten_tblra = torch.cat([flatten_tblrs, flatten_decoded_angle],
                                  dim=-1)
        # Rotated boxes are always decoded: the assigner consumes them even
        # when the regression loss itself is horizontal.
        flatten_rbboxes = distance2obb(
            self.flatten_priors_train[..., :2],
            flatten_tblra,
            angle_version=self.angle_version)
        if self.use_hbbox_loss:
            flatten_hbboxes = distance2bbox(self.flatten_priors_train[..., :2],
                                            flatten_tblrs)

        # Assignment is done on detached predictions.
        assigned_result = self.assigner(flatten_rbboxes.detach(),
                                        flatten_cls_scores.detach(),
                                        self.flatten_priors_train, gt_labels,
                                        gt_bboxes, pad_bbox_flag)

        labels = assigned_result['assigned_labels'].reshape(-1)
        label_weights = assigned_result['assigned_labels_weights'].reshape(-1)
        bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 5)
        assign_metrics = assigned_result['assign_metrics'].reshape(-1)
        cls_preds = flatten_cls_scores.reshape(-1, self.num_classes)

        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        bg_class_ind = self.num_classes
        pos_inds = ((labels >= 0)
                    & (labels < bg_class_ind)).nonzero().squeeze(1)
        # Normaliser shared by all losses, clamped to at least 1.
        avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item()

        loss_cls = self.loss_cls(
            cls_preds, (labels, assign_metrics),
            label_weights,
            avg_factor=avg_factor)

        pos_bbox_targets = bbox_targets[pos_inds]

        # From here on `bbox_preds` / `angle_preds` are rebound to the
        # flattened tensors used by the losses.
        if self.use_hbbox_loss:
            bbox_preds = flatten_hbboxes.reshape(-1, 4)
            pos_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets[:, :4])
        else:
            bbox_preds = flatten_rbboxes.reshape(-1, 5)
        angle_preds = flatten_angles.reshape(-1, self.angle_out_dim)

        if len(pos_inds) > 0:
            loss_bbox = self.loss_bbox(
                bbox_preds[pos_inds],
                pos_bbox_targets,
                weight=assign_metrics[pos_inds],
                avg_factor=avg_factor)
            # Zero placeholder that still depends on angle_preds; replaced
            # below when an angle loss is configured.
            loss_angle = angle_preds.sum() * 0
            if self.loss_angle is not None:
                pos_angle_targets = bbox_targets[pos_inds][:, 4:5]
                pos_angle_targets = self.angle_coder.encode(pos_angle_targets)
                loss_angle = self.loss_angle(
                    angle_preds[pos_inds],
                    pos_angle_targets,
                    weight=assign_metrics[pos_inds],
                    avg_factor=avg_factor)
        else:
            # No positives: zero losses that remain connected to the
            # predictions.
            loss_bbox = bbox_preds.sum() * 0
            loss_angle = angle_preds.sum() * 0

        losses = dict()
        losses['loss_cls'] = loss_cls
        losses['loss_bbox'] = loss_bbox
        if self.loss_angle is not None:
            losses['loss_angle'] = loss_angle

        return losses
def get_prior_xy_info(index: int, num_base_priors: int,
                      featmap_sizes: Tuple[int, int]) -> Tuple[int, int, int]:
    """Get prior index and xy index in feature map by flatten index.

    The flattened index is decomposed with the prior id varying fastest,
    then the x position, then the y position.

    Args:
        index (int): Flattened index of one prior on a feature level.
        num_base_priors (int): Number of priors per grid cell.
        featmap_sizes (Tuple[int, int]): Two-element size of the feature
            map. Only the second entry (the width) is used; the first is
            ignored.

    Returns:
        Tuple[int, int, int]: ``(prior, grid_x, grid_y)``.
    """
    # `featmap_sizes` is a pair, not an int: only the width is needed to
    # turn the cell index into (x, y).
    _, featmap_w = featmap_sizes
    priors = index % num_base_priors
    xy_index = index // num_base_priors
    grid_y = xy_index // featmap_w
    grid_x = xy_index % featmap_w
    return priors, grid_x, grid_y
@MODELS.register_module()
class YOLOv5HeadModule(BaseModule):
    """Prediction layers of the YOLOv5 head.

    A single 1x1 conv per feature level outputs, for each base prior,
    ``5 + num_classes`` channels. ``forward_single`` splits them into box
    (first 4), objectness (5th) and class (remaining) predictions.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Union[int, Sequence]): Number of channels in the input
            feature map. An int is used for every level.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid. Defaults to 3.
        featmap_strides (Sequence[int]): Downsample factor of each feature
            map. Defaults to (8, 16, 32).
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: Union[int, Sequence],
                 widen_factor: float = 1.0,
                 num_base_priors: int = 3,
                 featmap_strides: Sequence[int] = (8, 16, 32),
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.widen_factor = widen_factor
        self.num_base_priors = num_base_priors
        self.featmap_strides = featmap_strides
        self.num_levels = len(featmap_strides)
        # 4 box values + 1 objectness + one score per class.
        self.num_out_attrib = 5 + num_classes

        if isinstance(in_channels, int):
            # One width shared by all levels.
            shared_width = make_divisible(in_channels, widen_factor)
            self.in_channels = [shared_width for _ in range(self.num_levels)]
        else:
            self.in_channels = [
                make_divisible(channels, widen_factor)
                for channels in in_channels
            ]

        self._init_layers()

    def _init_layers(self):
        """Create one 1x1 prediction conv per feature level."""
        out_channels = self.num_base_priors * self.num_out_attrib
        self.convs_pred = nn.ModuleList(
            nn.Conv2d(self.in_channels[level], out_channels, 1)
            for level in range(self.num_levels))

    def init_weights(self):
        """Initialize the bias of YOLOv5 head."""
        super().init_weights()
        for conv, stride in zip(self.convs_pred, self.featmap_strides):
            bias = conv.bias.data.view(self.num_base_priors, -1)
            # obj (8 objects per 640 image)
            bias.data[:, 4] += math.log(8 / (640 / stride)**2)
            # NOTE: only the class slots are touched. Applying the same
            # initialization to the bias of the mask coefficient would
            # significantly decrease mask AP.
            cls_end = 5 + self.num_classes
            bias.data[:, 5:cls_end] += math.log(
                0.6 / (self.num_classes - 0.999999))

            conv.bias.data = bias.view(-1)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Apply ``forward_single`` to every feature level.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions, and objectnesses.
        """
        assert len(x) == self.num_levels
        return multi_apply(self.forward_single, x, self.convs_pred)

    def forward_single(self, x: Tensor,
                       convs: nn.Module) -> Tuple[Tensor, Tensor, Tensor]:
        """Predict on one level and split the output per attribute."""
        raw = convs(x)
        batch, _, height, width = raw.shape
        per_prior = raw.view(batch, self.num_base_priors,
                             self.num_out_attrib, height, width)

        out_shape = (batch, -1, height, width)
        bbox_pred = per_prior[:, :, :4, ...].reshape(*out_shape)
        objectness = per_prior[:, :, 4:5, ...].reshape(*out_shape)
        cls_score = per_prior[:, :, 5:, ...].reshape(*out_shape)

        return cls_score, bbox_pred, objectness
    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.YOLOAnchorGenerator',
                     base_sizes=[[(10, 13), (16, 30), (33, 23)],
                                 [(30, 61), (62, 45), (59, 119)],
                                 [(116, 90), (156, 198), (373, 326)]],
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='YOLOv5BBoxCoder'),
                 loss_cls: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='mean',
                     loss_weight=0.5),
                 loss_bbox: ConfigType = dict(
                     type='IoULoss',
                     iou_mode='ciou',
                     bbox_format='xywh',
                     eps=1e-7,
                     reduction='mean',
                     loss_weight=0.05,
                     return_iou=True),
                 loss_obj: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='mean',
                     loss_weight=1.0),
                 prior_match_thr: float = 4.0,
                 near_neighbor_thr: float = 0.5,
                 ignore_iof_thr: float = -1.0,
                 obj_level_weights: List[float] = [4.0, 1.0, 0.4],
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        """Build the head module, losses, prior generator and bbox coder.

        ``near_neighbor_thr`` (float, default 0.5) is accepted and stored
        on the instance like the other thresholds. ``special_init`` is
        called last, after every attribute it reads has been set.
        """
        super().__init__(init_cfg=init_cfg)

        # The head module is the source of truth for class count and
        # strides; the values below are read back from it.
        self.head_module = MODELS.build(head_module)
        self.num_classes = self.head_module.num_classes
        self.featmap_strides = self.head_module.featmap_strides
        self.num_levels = len(self.featmap_strides)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        self.loss_cls: nn.Module = MODELS.build(loss_cls)
        self.loss_bbox: nn.Module = MODELS.build(loss_bbox)
        self.loss_obj: nn.Module = MODELS.build(loss_obj)

        self.prior_generator = TASK_UTILS.build(prior_generator)
        self.bbox_coder = TASK_UTILS.build(bbox_coder)
        # Uses the first level's prior count for the whole head.
        self.num_base_priors = self.prior_generator.num_base_priors[0]

        # Placeholder of one dummy tensor per level; predict_by_feat
        # compares the current feature-map sizes against this attribute
        # before reusing cached priors.
        self.featmap_sizes = [torch.empty(1)] * self.num_levels

        self.prior_match_thr = prior_match_thr
        self.near_neighbor_thr = near_neighbor_thr
        self.obj_level_weights = obj_level_weights
        self.ignore_iof_thr = ignore_iof_thr

        self.special_init()
algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + assert len(self.obj_level_weights) == len( + self.featmap_strides) == self.num_levels + if self.prior_match_thr != 4.0: + print_log( + "!!!Now, you've changed the prior_match_thr " + 'parameter to something other than 4.0. Please make sure ' + 'that you have modified both the regression formula in ' + 'bbox_coder and before loss_box computation, ' + 'otherwise the accuracy may be degraded!!!') + + if self.num_classes == 1: + print_log('!!!You are using `YOLOv5Head` with num_classes == 1.' + ' The loss_cls will be 0. This is a normal phenomenon.') + + priors_base_sizes = torch.tensor( + self.prior_generator.base_sizes, dtype=torch.float) + featmap_strides = torch.tensor( + self.featmap_strides, dtype=torch.float)[:, None, None] + self.register_buffer( + 'priors_base_sizes', + priors_base_sizes / featmap_strides, + persistent=False) + + grid_offset = torch.tensor([ + [0, 0], # center + [1, 0], # left + [0, 1], # up + [-1, 0], # right + [0, -1], # bottom + ]).float() + self.register_buffer( + 'grid_offset', grid_offset[:, None], persistent=False) + + prior_inds = torch.arange(self.num_base_priors).float().view( + self.num_base_priors, 1) + self.register_buffer('prior_inds', prior_inds, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        objectnesses: Optional[List[Tensor]] = None,
                        batch_img_metas: Optional[List[dict]] = None,
                        cfg: Optional[ConfigDict] = None,
                        rescale: bool = True,
                        with_nms: bool = True) -> List[InstanceData]:
        """Transform a batch of output features extracted by the head into
        bbox results.

        Args:
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            objectnesses (list[Tensor], Optional): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).
            batch_img_metas (list[dict], Optional): Batch image meta info.
                Although the default is None, the body calls ``len()`` on
                it and reads ``ori_shape`` and ``scale_factor`` (and
                optionally ``pad_param``) from every entry, so it must be
                provided.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None. A deep copy is modified, never the
                object passed in.
            rescale (bool): If True, return boxes in original image space.
                Defaults to True.
            with_nms (bool): If True, do nms before return boxes.
                Defaults to True.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
        """
        assert len(cls_scores) == len(bbox_preds)
        if objectnesses is None:
            with_objectnesses = False
        else:
            with_objectnesses = True
            assert len(cls_scores) == len(objectnesses)

        cfg = self.test_cfg if cfg is None else cfg
        # Work on a private copy: multi_label / max_per_img are rewritten
        # below and must not leak into self.test_cfg.
        cfg = copy.deepcopy(cfg)

        # Multi-label only makes sense with more than one class.
        multi_label = cfg.multi_label
        multi_label &= self.num_classes > 1
        cfg.multi_label = multi_label

        num_imgs = len(batch_img_metas)
        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]

        # If the shape does not change, use the previous mlvl_priors
        if featmap_sizes != self.featmap_sizes:
            self.mlvl_priors = self.prior_generator.grid_priors(
                featmap_sizes,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device)
            self.featmap_sizes = featmap_sizes
        flatten_priors = torch.cat(self.mlvl_priors)

        # One stride value per prior, level by level, in the same order as
        # the concatenated priors.
        mlvl_strides = [
            flatten_priors.new_full(
                (featmap_size.numel() * self.num_base_priors, ), stride) for
            featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
        ]
        flatten_stride = torch.cat(mlvl_strides)

        # flatten cls_scores, bbox_preds and objectness
        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.num_classes)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]

        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
        flatten_decoded_bboxes = self.bbox_coder.decode(
            flatten_priors[None], flatten_bbox_preds, flatten_stride)

        if with_objectnesses:
            flatten_objectness = [
                objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
                for objectness in objectnesses
            ]
            flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
        else:
            # Placeholder so the zip below still yields one item per image.
            flatten_objectness = [None for _ in range(num_imgs)]

        results_list = []
        for (bboxes, scores, objectness,
             img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
                              flatten_objectness, batch_img_metas):
            ori_shape = img_meta['ori_shape']
            scale_factor = img_meta['scale_factor']
            if 'pad_param' in img_meta:
                pad_param = img_meta['pad_param']
            else:
                pad_param = None

            score_thr = cfg.get('score_thr', -1)
            # yolox_style does not require the following operations
            if objectness is not None and score_thr > 0 and not cfg.get(
                    'yolox_style', False):
                conf_inds = objectness > score_thr
                bboxes = bboxes[conf_inds, :]
                scores = scores[conf_inds, :]
                objectness = objectness[conf_inds]

            if objectness is not None:
                # conf = obj_conf * cls_conf
                scores *= objectness[:, None]

            # Nothing left for this image: emit an empty result and move on.
            if scores.shape[0] == 0:
                empty_results = InstanceData()
                empty_results.bboxes = bboxes
                empty_results.scores = scores[:, 0]
                empty_results.labels = scores[:, 0].int()
                results_list.append(empty_results)
                continue

            nms_pre = cfg.get('nms_pre', 100000)
            if cfg.multi_label is False:
                # Single-label: keep only the best class per box.
                scores, labels = scores.max(1, keepdim=True)
                scores, _, keep_idxs, results = filter_scores_and_topk(
                    scores,
                    score_thr,
                    nms_pre,
                    results=dict(labels=labels[:, 0]))
                labels = results['labels']
            else:
                scores, labels, keep_idxs, _ = filter_scores_and_topk(
                    scores, score_thr, nms_pre)

            results = InstanceData(
                scores=scores, labels=labels, bboxes=bboxes[keep_idxs])

            if rescale:
                if pad_param is not None:
                    # Subtract pad_param[2] from x and pad_param[0] from y
                    # (presumably the left and top padding - confirm).
                    results.bboxes -= results.bboxes.new_tensor([
                        pad_param[2], pad_param[0], pad_param[2], pad_param[0]
                    ])
                # scale_factor is repeated so it divides all four box
                # coordinates.
                results.bboxes /= results.bboxes.new_tensor(
                    scale_factor).repeat((1, 2))

            if cfg.get('yolox_style', False):
                # do not need max_per_img
                cfg.max_per_img = len(results)

            # rescale=False: boxes were already rescaled above.
            results = self._bbox_post_process(
                results=results,
                cfg=cfg,
                rescale=False,
                with_nms=with_nms,
                img_meta=img_meta)
            # Clamp x coordinates to ori_shape[1] and y to ori_shape[0];
            # this runs whether or not `rescale` is set.
            results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
            results.bboxes[:, 1::2].clamp_(0, ori_shape[0])

            results_list.append(results)
        return results_list
def loss(self, x: Tuple[Tensor],
         batch_data_samples: Union[list, dict]) -> dict:
    """Run the head on the input features and compute the training losses.

    Args:
        x (tuple[Tensor]): Features from the upstream network, each is
            a 4D-tensor.
        batch_data_samples (List[:obj:`DetDataSample`], dict): The data
            samples. A ``list`` is delegated to the parent class. Any
            other value is indexed with the keys ``'bboxes_labels'`` and
            ``'img_metas'`` (the "fast version").

    Returns:
        dict: A dictionary of loss components.
    """
    if isinstance(batch_data_samples, list):
        # Regular path: the parent implementation handles list inputs.
        return super().loss(x, batch_data_samples)

    # Fast version: forward the head first, then append the ground truth
    # and the image metas to the head outputs and pass them on positionally.
    head_outputs = self(x)
    gt_and_metas = (batch_data_samples['bboxes_labels'],
                    batch_data_samples['img_metas'])
    return self.loss_by_feat(*(head_outputs + gt_and_metas))
+ """ + if self.ignore_iof_thr != -1: + # TODO: Support fast version + # convert ignore gt + batch_target_ignore_list = [] + for i, gt_instances_ignore in enumerate(batch_gt_instances_ignore): + bboxes = gt_instances_ignore.bboxes + labels = gt_instances_ignore.labels + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, bboxes) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_ignore_list.append(target) + + # (num_bboxes, 6) + batch_gt_targets_ignore = torch.cat( + batch_target_ignore_list, dim=0) + if batch_gt_targets_ignore.shape[0] != 0: + # Consider regions with ignore in annotations + return self._loss_by_feat_with_ignore( + cls_scores, + bbox_preds, + objectnesses, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_targets_ignore) + + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. 
Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. 
Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + if isinstance(batch_gt_instances, torch.Tensor): + # fast version + img_shape = batch_img_metas[0]['batch_input_shape'] + gt_bboxes_xyxy = batch_gt_instances[:, 2:] + xy1, xy2 = gt_bboxes_xyxy.split((2, 2), dim=-1) + gt_bboxes_xywh = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + gt_bboxes_xywh[:, 1::2] /= img_shape[0] + gt_bboxes_xywh[:, 0::2] /= img_shape[1] + batch_gt_instances[:, 2:] = gt_bboxes_xywh + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = batch_gt_instances.repeat( + self.num_base_priors, 1, 1) + else: + batch_target_list = [] + # Convert xyxy bbox to yolo format. 
+ for i, gt_instances in enumerate(batch_gt_instances): + img_shape = batch_img_metas[i]['batch_input_shape'] + bboxes = gt_instances.bboxes + labels = gt_instances.labels + + xy1, xy2 = bboxes.split((2, 2), dim=-1) + bboxes = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + # normalized to 0-1 + bboxes[:, 1::2] /= img_shape[0] + bboxes[:, 0::2] /= img_shape[1] + + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, normed_bbox) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_list.append(target) + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = torch.cat( + batch_target_list, dim=0).repeat(self.num_base_priors, 1, 1) + + # (num_base_priors, num_bboxes, 1) + batch_targets_prior_inds = self.prior_inds.repeat( + 1, batch_targets_normed.shape[1])[..., None] + # (num_base_priors, num_bboxes, 7) + # (img_ind, labels, bbox_cx, bbox_cy, bbox_w, bbox_h, prior_ind) + batch_targets_normed = torch.cat( + (batch_targets_normed, batch_targets_prior_inds), 2) + return batch_targets_normed + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred + + def _loss_by_feat_with_ignore( + self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: Sequence[Tensor]) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. 
+ objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (Sequence[Tensor]): Ignore boxes with + batch_ids and labels, each is a 2D-tensor, the channel number + is 6, means that (batch_id, label, xmin, ymin, xmax, ymax). + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + not_ignore_flags = bbox_preds[i].new_ones(batch_size, + self.num_base_priors, h, + w) + + ignore_overlaps = bbox_overlaps(self.mlvl_priors[i], + batch_gt_instances_ignore[..., 2:], + 'iof') + ignore_max_overlaps, ignore_max_ignore_index = ignore_overlaps.max( + dim=1) + + batch_inds = batch_gt_instances_ignore[:, + 0][ignore_max_ignore_index] + ignore_inds = (ignore_max_overlaps > self.ignore_iof_thr).nonzero( + as_tuple=True)[0] + batch_inds = batch_inds[ignore_inds].long() + ignore_priors, ignore_grid_xs, ignore_grid_ys = get_prior_xy_info( + ignore_inds, self.num_base_priors, self.featmap_sizes[i]) + 
not_ignore_flags[batch_inds, ignore_priors, ignore_grid_ys, + ignore_grid_xs] = 0 + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. 
Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + + not_ignore_weights = not_ignore_flags[img_inds, priors_inds, + grid_y_inds, grid_x_inds] + loss_box_i, iou = self.loss_bbox( + decoded_bbox_pred, + bboxes_targets, + weight=not_ignore_weights, + avg_factor=max(not_ignore_weights.sum(), 1)) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. 
class ProtoModule(BaseModule):
    """Mask prototype branch for YOLOv5 segmentation models.

    The module chains a ``ConvModule`` with ``kernel_size=3``, a 2x
    nearest-neighbour ``nn.Upsample``, a second ``kernel_size=3``
    ``ConvModule`` and a final ``kernel_size=1`` ``ConvModule`` that outputs
    ``mask_channels`` channels.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Defaults to 32.
        middle_channels (int): Number of channels in the middle feature map.
            Defaults to 256.
        mask_channels (int): Number of channels in the output mask feature
            map. Defaults to 32.
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``.
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation
            layer. Defaults to ``dict(type='SiLU', inplace=True)``.
    """

    def __init__(self,
                 *args,
                 in_channels: int = 32,
                 middle_channels: int = 256,
                 mask_channels: int = 32,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 **kwargs):
        super().__init__(*args, **kwargs)
        # All three conv blocks share the same norm / activation settings.
        shared_cfg = dict(norm_cfg=norm_cfg, act_cfg=act_cfg)

        self.conv1 = ConvModule(
            in_channels,
            middle_channels,
            kernel_size=3,
            padding=1,
            **shared_cfg)
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv2 = ConvModule(
            middle_channels,
            middle_channels,
            kernel_size=3,
            padding=1,
            **shared_cfg)
        self.conv3 = ConvModule(
            middle_channels, mask_channels, kernel_size=1, **shared_cfg)

    def forward(self, x: Tensor) -> Tensor:
        """Apply conv1 -> upsample -> conv2 -> conv3 to ``x``."""
        feat = self.conv1(x)
        feat = self.upsample(feat)
        feat = self.conv2(feat)
        return self.conv3(feat)
+ """ + + def __init__(self, + *args, + num_classes: int, + mask_channels: int = 32, + proto_channels: int = 256, + widen_factor: float = 1.0, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.mask_channels = mask_channels + self.num_out_attrib_with_proto = 5 + num_classes + mask_channels + self.proto_channels = make_divisible(proto_channels, widen_factor) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__( + *args, + num_classes=num_classes, + widen_factor=widen_factor, + **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv5 Ins head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d( + self.in_channels[i], + self.num_base_priors * self.num_out_attrib_with_proto, 1) + self.convs_pred.append(conv_pred) + + self.proto_pred = ProtoModule( + in_channels=self.in_channels[0], + middle_channels=self.proto_channels, + mask_channels=self.mask_channels, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, objectnesses, and mask predictions. 
def forward_single(
        self, x: Tensor,
        convs_pred: nn.Module) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """Run the prediction layer of one scale level and split its output.

    Args:
        x (Tensor): Feature map of a single scale level.
        convs_pred (nn.Module): Prediction layer applied to ``x``. The
            channel dimension of its output is viewed as
            ``(num_base_priors, num_out_attrib_with_proto)``.

    Returns:
        tuple[Tensor]: ``(cls_score, bbox_pred, objectness, coeff_pred)``,
        each reshaped to ``(bs, -1, ny, nx)``.
    """
    raw = convs_pred(x)
    bs, _, ny, nx = raw.shape
    per_prior = raw.view(bs, self.num_base_priors,
                         self.num_out_attrib_with_proto, ny, nx)

    # Attribute layout for every prior:
    #   [0:4] box, [4:5] objectness, [5:cls_end] class scores,
    #   [cls_end:] mask coefficients.
    cls_end = self.num_classes + 5

    def _merge_priors(chunk: Tensor) -> Tensor:
        # Fold the prior and attribute axes back into one channel axis.
        return chunk.reshape(bs, -1, ny, nx)

    bbox_pred = _merge_priors(per_prior[:, :, :4, ...])
    objectness = _merge_priors(per_prior[:, :, 4:5, ...])
    cls_score = _merge_priors(per_prior[:, :, 5:cls_end, ...])
    coeff_pred = _merge_priors(per_prior[:, :, cls_end:, ...])

    return cls_score, bbox_pred, objectness, coeff_pred
It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + # TODO: support non-fast version ins segmention + raise NotImplementedError + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['masks'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + coeff_preds: Sequence[Tensor], + proto_preds: Tensor, + batch_gt_instances: Sequence[InstanceData], + batch_gt_masks: Sequence[Tensor], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (Sequence[Tensor]): Mask coefficient for each scale + level, each is a 4D-tensor, the channel number is + num_priors * mask_channels. + proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_gt_masks (Sequence[Tensor]): Batch of gt_mask. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + loss_mask = torch.zeros(1, device=device) + scaled_factor = torch.ones(8, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 8) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + # 3. 
Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh,\ + priors_targets_inds = _chunk_targets + (priors_inds, targets_inds) = priors_targets_inds.long().T + (img_inds, class_inds) = img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. 
Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + # mask regression + retained_coeff_preds = coeff_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + _, c, mask_h, mask_w = proto_preds.shape + if batch_gt_masks.shape[-2:] != (mask_h, mask_w): + batch_gt_masks = F.interpolate( + batch_gt_masks[None], (mask_h, mask_w), mode='nearest')[0] + + xywh_normed = batch_targets_scaled[:, 2:6] / scaled_factor[2:6] + area_normed = xywh_normed[:, 2:].prod(1) + xywh_scaled = xywh_normed * torch.tensor( + proto_preds.shape, device=device)[[3, 2, 3, 2]] + xyxy_scaled = bbox_cxcywh_to_xyxy(xywh_scaled) + + for bs in range(batch_size): + match_inds = (img_inds == bs) # matching index + if not match_inds.any(): + continue + + if self.mask_overlap: + mask_gti = torch.where( + batch_gt_masks[bs][None] == + targets_inds[match_inds].view(-1, 1, 1), 1.0, 0.0) + else: + mask_gti = batch_gt_masks[targets_inds][match_inds] + + 
def _convert_gt_to_norm_format(self,
                               batch_gt_instances: Sequence[InstanceData],
                               batch_img_metas: Sequence[dict]) -> Tensor:
    """Extend the parent's normalized targets with a target-index column.

    With ``mask_overlap`` enabled the index restarts at 1 for every image
    (as many images as there are entries in ``batch_img_metas``), counting
    the rows whose first column equals the image index. Otherwise it is a
    single 0-based running index over all rows of ``batch_gt_instances``.

    Args:
        batch_gt_instances (Tensor): Ground truth rows; column 0 is read
            as the image index. Indexed as a 2D tensor here.
        batch_img_metas (Sequence[dict]): Meta information of each image.

    Returns:
        Tensor: The parent's targets with the target index appended as one
        extra trailing column.
    """
    targets = super()._convert_gt_to_norm_format(batch_gt_instances,
                                                 batch_img_metas)
    device = batch_gt_instances.device
    num_priors = self.num_base_priors

    def _index_rows(count):
        # (num_anchor, count): 0-based indices repeated for every prior.
        return torch.arange(
            count, device=device).float().view(1, count).repeat(
                num_priors, 1)

    if not self.mask_overlap:
        target_inds = _index_rows(batch_gt_instances.shape[0])
    else:
        per_image = []
        for img_idx in range(len(batch_img_metas)):
            # find number of targets of each image
            num_gts = (batch_gt_instances[:, 0] == img_idx).sum()
            per_image.append(_index_rows(num_gts) + 1)
        target_inds = torch.cat(per_image, 1)

    return torch.cat([targets, target_inds[..., None]], 2)
+ Defaults to True. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instances, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + assert len(cls_scores) == len(bbox_preds) == len(coeff_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_coeff_preds = [ + coeff_pred.permute(0, 2, 3, + 1).reshape(num_imgs, 
-1, + self.head_module.mask_channels) + for coeff_pred in coeff_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors.unsqueeze(0), flatten_bbox_preds, flatten_stride) + + flatten_coeff_preds = torch.cat(flatten_coeff_preds, dim=1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + # one placeholder per image: the zip below walks images, not + # feature levels, and stops at its shortest argument + flatten_objectness = [None for _ in range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, coeffs, mask_proto, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, flatten_coeff_preds, + proto_preds, batch_img_metas): + ori_shape = img_meta['ori_shape'] + batch_input_shape = img_meta['batch_input_shape'] + input_shape_h, input_shape_w = batch_input_shape + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + input_shape_withoutpad = (input_shape_h - pad_param[0] - + pad_param[1], input_shape_w - + pad_param[2] - pad_param[3]) + else: + pad_param = None + input_shape_withoutpad = batch_input_shape + scale_factor = (input_shape_withoutpad[1] / ori_shape[1], + input_shape_withoutpad[0] / ori_shape[0]) + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + coeffs = coeffs[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + # NOTE: Important + coeffs *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + 
empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0], coeffs=coeffs)) + labels = results['labels'] + coeffs = results['coeffs'] + else: + out = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(coeffs=coeffs)) + scores, labels, keep_idxs, filtered_results = out + coeffs = filtered_results['coeffs'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + coeffs=coeffs) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + if len(results.bboxes): + masks = self.process_mask(mask_proto, results.coeffs, + results.bboxes, + (input_shape_h, input_shape_w), True) + if rescale: + if pad_param is not None: + # bbox minus pad param + top_pad, _, left_pad, _ = pad_param + results.bboxes -= results.bboxes.new_tensor( + [left_pad, top_pad, left_pad, top_pad]) + # mask crop pad param + top, left = int(top_pad), int(left_pad) + bottom, right = int(input_shape_h - + top_pad), int(input_shape_w - + left_pad) + masks = masks[:, :, top:bottom, left:right] + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + fast_test = cfg.get('fast_test', False) + if fast_test: + masks = F.interpolate( + masks, + size=ori_shape, + mode='bilinear', + align_corners=False) + masks = masks.squeeze(0) + masks = masks > cfg.mask_thr_binary + else: + masks.gt_(cfg.mask_thr_binary) 
+ masks = torch.as_tensor(masks, dtype=torch.uint8) + masks = masks[0].permute(1, 2, + 0).contiguous().cpu().numpy() + masks = mmcv.imresize(masks, + (ori_shape[1], ori_shape[0])) + + if len(masks.shape) == 2: + masks = masks[:, :, None] + masks = torch.from_numpy(masks).permute(2, 0, 1) + + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results.masks = masks.bool() + results_list.append(results) + else: + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(results) + return results_list + + def process_mask(self, + mask_proto: Tensor, + mask_coeff_pred: Tensor, + bboxes: Tensor, + shape: Tuple[int, int], + upsample: bool = False) -> Tensor: + """Generate sigmoid-activated instance masks cropped to their boxes. + + Args: + mask_proto (Tensor): Mask prototype features of a single image. + Has shape (mask_channels, H, W). + mask_coeff_pred (Tensor): Mask coefficients prediction for + single image. Has shape (num_instance, mask_channels). + bboxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). + shape (Tuple): Batch input shape of image. + upsample (bool): Whether upsample masks results to batch input + shape. Default to False. + Return: + Tensor: Instance segmentation masks for each instance. + Has shape (1, num_instance, H, W); H and W equal ``shape`` when + ``upsample`` is True, otherwise the prototype size. + """ + c, mh, mw = mask_proto.shape # CHW + masks = ( + mask_coeff_pred @ mask_proto.float().view(c, -1)).sigmoid().view( + -1, mh, mw)[None] + if upsample: + masks = F.interpolate( + masks, shape, mode='bilinear', align_corners=False) # 1CHW + masks = self.crop_mask(masks, bboxes) + return masks + + def crop_mask(self, masks: Tensor, boxes: Tensor) -> Tensor: + """Crop mask by the bounding box. + + Args: + masks (Tensor): Predicted mask results. Has shape + (1, num_instance, H, W). + boxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). 
+ Returns: + (torch.Tensor): The masks are being cropped to the bounding box. + """ + _, n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) + r = torch.arange( + w, device=masks.device, + dtype=x1.dtype)[None, None, None, :] # x indices, shape(1, 1, 1, w) + c = torch.arange( + h, device=masks.device, + dtype=x1.dtype)[None, None, :, None] # y indices, shape(1, 1, h, 1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3b01133f04f467de9beab08ac9bae602d4588a96 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py @@ -0,0 +1,396 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine import MessageHub +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv6HeadModule(BaseModule): + """YOLOv6Head head module used in `YOLOv6. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors: (int): The number of priors (points) at a point + on the feature grid. 
+ featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + None, otherwise False. Defaults to "auto". + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + reg_max=0, + featmap_strides: Sequence[int] = (8, 16, 32), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.reg_max = reg_max + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv6 head.""" + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.stems = nn.ModuleList() + + if self.reg_max > 1: + proj = torch.arange( + self.reg_max + self.num_base_priors, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + for i in range(self.num_levels): + self.stems.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=1, + stride=1, + padding=1 // 2, + 
norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=self.num_base_priors * self.num_classes, + kernel_size=1)) + self.reg_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=(self.num_base_priors + self.reg_max) * 4, + kernel_size=1)) + + def init_weights(self): + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. 
+ """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.stems, self.cls_convs, + self.cls_preds, self.reg_convs, self.reg_preds) + + def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, + cls_pred: nn.Module, reg_conv: nn.Module, + reg_pred: nn.Module) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + y = stem(x) + cls_x = y + reg_x = y + cls_feat = cls_conv(cls_x) + reg_feat = reg_conv(reg_x) + + cls_score = cls_pred(cls_feat) + bbox_dist_preds = reg_pred(reg_feat) + + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + self.num_base_priors, + h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + + if self.training: + return cls_score, bbox_preds, bbox_dist_preds + else: + return cls_score, bbox_preds + + +@MODELS.register_module() +class YOLOv6Head(YOLOv5Head): + """YOLOv6Head head used in `YOLOv6 `_. + + Args: + head_module(ConfigType): Base module used for YOLOv6Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + # yolov6 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg.initial_assigner) + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + 
self.num_classes) + for cls_pred in cls_scores + ] + + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[:, 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # TODO: Add all_reduce makes training more stable + assigned_scores_sum = assigned_scores.sum() + if assigned_scores_sum > 0: + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = 
self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * world_size, loss_bbox=loss_bbox * world_size) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..124883cf4b4c5b51d6643edc7c2f813178d80c78 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py @@ -0,0 +1,404 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.dist import get_dist_info +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers import ImplicitA, ImplicitM +from ..task_modules.assigners.batch_yolov7_assigner import BatchYOLOv7Assigner +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +@MODELS.register_module() +class YOLOv7HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ImplicitA(self.in_channels[i]), + nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv7 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + 
mi = mi[1] # nn.Conv2d + + b = mi.bias.data.view(self.num_base_priors, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + + mi.bias.data = b.view(-1) + + +@MODELS.register_module() +class YOLOv7p6HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def __init__(self, + *args, + main_out_channels: Sequence[int] = [256, 512, 768, 1024], + aux_out_channels: Sequence[int] = [320, 640, 960, 1280], + use_aux: bool = True, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.main_out_channels = main_out_channels + self.aux_out_channels = aux_out_channels + self.use_aux = use_aux + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__(*args, **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.main_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.main_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ImplicitA(self.main_out_channels[i]), + nn.Conv2d(self.main_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.main_convs_pred.append(conv_pred) + + if self.use_aux: + self.aux_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + aux_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.aux_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(self.aux_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1)) + self.aux_convs_pred.append(aux_pred) + else: + self.aux_convs_pred = [None] * len(self.main_convs_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super(YOLOv5HeadModule, 
self).init_weights() + for mi, aux, s in zip(self.main_convs_pred, self.aux_convs_pred, + self.featmap_strides): # from + mi = mi[2] # nn.Conv2d + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + if self.use_aux: + aux = aux[1] # nn.Conv2d + b = aux.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + # write back to the aux conv; assigning to mi here would + # replace the main-branch bias with the aux one + aux.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.main_convs_pred, + self.aux_convs_pred) + + def forward_single(self, x: Tensor, convs: nn.Module, + aux_convs: Optional[nn.Module]) \ + -> Tuple[Union[Tensor, List], Union[Tensor, List], + Union[Tensor, List]]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + if not self.training or not self.use_aux: + return cls_score, bbox_pred, objectness + else: + aux_pred_map = aux_convs(x) + aux_pred_map = aux_pred_map.view(bs, self.num_base_priors, + self.num_out_attrib, ny, nx) + aux_cls_score = aux_pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + aux_bbox_pred = aux_pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + aux_objectness = aux_pred_map[:, :, 4:5, + ...].reshape(bs, 
-1, ny, nx) + + return [cls_score, + aux_cls_score], [bbox_pred, aux_bbox_pred + ], [objectness, aux_objectness] + + +@MODELS.register_module() +class YOLOv7Head(YOLOv5Head): + """YOLOv7Head head used in `YOLOv7 `_. + + Args: + simota_candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k in BatchYOLOv7Assigner. + Defaults to 20. + simota_iou_weight (float): The scale factor for regression + iou cost in BatchYOLOv7Assigner. Defaults to 3.0. + simota_cls_weight (float): The scale factor for classification + cost in BatchYOLOv7Assigner. Defaults to 1.0. + aux_loss_weights (float): Factor applied to the auxiliary-branch + losses before they are added to the main losses. + Defaults to 0.25. + """ + + def __init__(self, + *args, + simota_candidate_topk: int = 20, + simota_iou_weight: float = 3.0, + simota_cls_weight: float = 1.0, + aux_loss_weights: float = 0.25, + **kwargs): + super().__init__(*args, **kwargs) + self.aux_loss_weights = aux_loss_weights + self.assigner = BatchYOLOv7Assigner( + num_classes=self.num_classes, + num_base_priors=self.num_base_priors, + featmap_strides=self.featmap_strides, + prior_match_thr=self.prior_match_thr, + candidate_topk=simota_candidate_topk, + iou_weight=simota_iou_weight, + cls_weight=simota_cls_weight) + + def loss_by_feat( + self, + cls_scores: Sequence[Union[Tensor, List]], + bbox_preds: Sequence[Union[Tensor, List]], + objectnesses: Sequence[Union[Tensor, List]], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). 
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + if isinstance(cls_scores[0], Sequence): + with_aux = True + batch_size = cls_scores[0][0].shape[0] + device = cls_scores[0][0].device + + bbox_preds_main, bbox_preds_aux = zip(*bbox_preds) + objectnesses_main, objectnesses_aux = zip(*objectnesses) + cls_scores_main, cls_scores_aux = zip(*cls_scores) + + head_preds = self._merge_predict_results(bbox_preds_main, + objectnesses_main, + cls_scores_main) + head_preds_aux = self._merge_predict_results( + bbox_preds_aux, objectnesses_aux, cls_scores_aux) + else: + with_aux = False + batch_size = cls_scores[0].shape[0] + device = cls_scores[0].device + + head_preds = self._merge_predict_results(bbox_preds, objectnesses, + cls_scores) + + # Convert gt to norm xywh format + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + scaled_factors = [ + torch.tensor(head_pred.shape, device=device)[[3, 2, 3, 2]] + for head_pred in head_preds + ] + + loss_cls, loss_obj, loss_box = self._calc_loss( + head_preds=head_preds, + head_preds_aux=None, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + + if with_aux: + loss_cls_aux, loss_obj_aux, loss_box_aux = self._calc_loss( + head_preds=head_preds, + head_preds_aux=head_preds_aux, + 
batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr * 2, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + loss_cls += self.aux_loss_weights * loss_cls_aux + loss_obj += self.aux_loss_weights * loss_obj_aux + loss_box += self.aux_loss_weights * loss_box_aux + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _calc_loss(self, head_preds, head_preds_aux, batch_targets_normed, + near_neighbor_thr, scaled_factors, batch_img_metas, device): + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + + assigner_results = self.assigner( + head_preds, + batch_targets_normed, + batch_img_metas[0]['batch_input_shape'], + self.priors_base_sizes, + self.grid_offset, + near_neighbor_thr=near_neighbor_thr) + # mlvl is mean multi_level + mlvl_positive_infos = assigner_results['mlvl_positive_infos'] + mlvl_priors = assigner_results['mlvl_priors'] + mlvl_targets_normed = assigner_results['mlvl_targets_normed'] + + if head_preds_aux is not None: + # This is mean calc aux branch loss + head_preds = head_preds_aux + + for i, head_pred in enumerate(head_preds): + batch_inds, proir_idx, grid_x, grid_y = mlvl_positive_infos[i].T + num_pred_positive = batch_inds.shape[0] + target_obj = torch.zeros_like(head_pred[..., 0]) + # empty positive sampler + if num_pred_positive == 0: + loss_box += head_pred[..., :4].sum() * 0 + loss_cls += head_pred[..., 5:].sum() * 0 + loss_obj += self.loss_obj( + head_pred[..., 4], target_obj) * self.obj_level_weights[i] + continue + + priors = mlvl_priors[i] + targets_normed = mlvl_targets_normed[i] + + head_pred_positive = head_pred[batch_inds, proir_idx, grid_y, + grid_x] + + # calc bbox loss + grid_xy = torch.stack([grid_x, grid_y], dim=1) + decoded_pred_bbox = 
self._decode_bbox_to_xywh( + head_pred_positive[:, :4], priors, grid_xy) + target_bbox_scaled = targets_normed[:, 2:6] * scaled_factors[i] + + loss_box_i, iou = self.loss_bbox(decoded_pred_bbox, + target_bbox_scaled) + loss_box += loss_box_i + + # calc obj loss + target_obj[batch_inds, proir_idx, grid_y, + grid_x] = iou.detach().clamp(0).type(target_obj.dtype) + loss_obj += self.loss_obj(head_pred[..., 4], + target_obj) * self.obj_level_weights[i] + + # calc cls loss + if self.num_classes > 1: + pred_cls_scores = targets_normed[:, 1].long() + target_class = torch.full_like( + head_pred_positive[:, 5:], 0., device=device) + target_class[range(num_pred_positive), pred_cls_scores] = 1. + loss_cls += self.loss_cls(head_pred_positive[:, 5:], + target_class) + else: + loss_cls += head_pred_positive[:, 5:].sum() * 0 + return loss_cls, loss_obj, loss_box + + def _merge_predict_results(self, bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + cls_scores: Sequence[Tensor]) -> List[Tensor]: + """Merge predict output from 3 heads. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + + Returns: + List[Tensor]: Merged output. 
+ """ + head_preds = [] + for bbox_pred, objectness, cls_score in zip(bbox_preds, objectnesses, + cls_scores): + b, _, h, w = bbox_pred.shape + bbox_pred = bbox_pred.reshape(b, self.num_base_priors, -1, h, w) + objectness = objectness.reshape(b, self.num_base_priors, -1, h, w) + cls_score = cls_score.reshape(b, self.num_base_priors, -1, h, w) + head_pred = torch.cat([bbox_pred, objectness, cls_score], + dim=2).permute(0, 1, 3, 4, 2).contiguous() + head_preds.append(head_pred) + return head_preds + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes, + grid_xy) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + grid_xy + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..292024178ce2c249f63c9ce1168da767d9718fcf --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py @@ -0,0 +1,396 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess, make_divisible +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv8HeadModule(BaseModule): + """YOLOv8HeadModule head module used in `YOLOv8`. 
+ + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max-1}`` + in QFL setting. Defaults to 16. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_channels = in_channels + self.reg_max = reg_max + + in_channels = [] + for channel in self.in_channels: + channel = make_divisible(channel, widen_factor) + in_channels.append(channel) + self.in_channels = in_channels + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of YOLOv8 head.""" + super().init_weights() + for reg_pred, cls_pred, stride in 
zip(self.reg_preds, self.cls_preds, + self.featmap_strides): + reg_pred[-1].bias.data[:] = 1.0 # box + # cls (.01 objects, 80 classes, 640 img) + cls_pred[-1].bias.data[:self.num_classes] = math.log( + 5 / self.num_classes / (640 / stride)**2) + + def _init_layers(self): + """initialize conv layers in YOLOv8 head.""" + # Init decouple head + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + reg_out_channels = max( + (16, self.in_channels[0] // 4, self.reg_max * 4)) + cls_out_channels = max(self.in_channels[0], self.num_classes) + + for i in range(self.num_levels): + self.reg_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=reg_out_channels, + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=reg_out_channels, + out_channels=4 * self.reg_max, + kernel_size=1))) + self.cls_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=cls_out_channels, + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=cls_out_channels, + out_channels=self.num_classes, + kernel_size=1))) + + proj = torch.arange(self.reg_max, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.cls_preds, + self.reg_preds) + + def forward_single(self, x: torch.Tensor, cls_pred: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tuple: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + cls_logit = cls_pred(x) + bbox_dist_preds = reg_pred(x) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class YOLOv8Head(YOLOv5Head): + """YOLOv8Head head used in `YOLOv8`. + + Args: + head_module(:obj:`ConfigDict` or dict): Base module used for YOLOv8Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of Distribution Focal + Loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=7.5, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=1.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # YOLOv8 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, H*W, 4, reg_max). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the feature map sizes changed, regenerate the priors + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 
1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + flatten_pred_dists = [ + bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + assigned_scores_sum = assigned_scores.sum().clamp(min=1) + + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum() + loss_cls /= assigned_scores_sum + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, assigned_bboxes_pos, + weight=bbox_weight) / assigned_scores_sum + + # dfl loss + pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior] + assigned_ltrb = self.bbox_coder.encode( + 
self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * num_imgs * world_size, + loss_bbox=loss_bbox * num_imgs * world_size, + loss_dfl=loss_dfl * num_imgs * world_size) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a203298d8536148a7022711eabeee7f04fea8ab4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py @@ -0,0 +1,514 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.models.utils import multi_apply +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOXHeadModule(BaseModule): + """YOLOXHead head module used in `YOLOX. 
+ + `<https://arxiv.org/abs/2107.08430>`_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + use_depthwise: bool = False, + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.use_depthwise = use_depthwise + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = 
DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.num_classes, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self): + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOXHead(YOLOv5Head): + """YOLOXHead head used in `YOLOX `_. + + Args: + head_module(ConfigType): Base module used for YOLOXHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + loss_bbox_aux (:obj:`ConfigDict` or dict): Config of bbox aux loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOXBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux: ConfigType = dict( + type='mmdet.L1Loss', reduction='sum', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + self.use_bbox_aux = False + self.loss_bbox_aux = loss_bbox_aux + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_obj=loss_obj, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + self.loss_bbox_aux: nn.Module = MODELS.build(self.loss_bbox_aux) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + batch_gt_instances = self.gt_instances_preprocess( + batch_gt_instances, len(batch_img_metas)) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = 
self.bbox_coder.decode(flatten_priors[..., :2], + flatten_bbox_preds, + flatten_priors[..., 2]) + + (pos_masks, cls_targets, obj_targets, bbox_targets, bbox_aux_target, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_bbox_aux: + bbox_aux_target = torch.cat(bbox_aux_target, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_bbox_aux: + if num_pos > 0: + loss_bbox_aux = self.loss_bbox_aux( + flatten_bbox_preds.view(-1, 4)[pos_masks], + bbox_aux_target) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. 
+ # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_bbox_aux = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_bbox_aux=loss_bbox_aux) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. + + Args: + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, cy, stride_w, stride_h] format. + cls_preds (Tensor): Classification predictions of one image, + a 2D-Tensor with shape [num_priors, num_classes] + decoded_bboxes (Tensor): Decoded bboxes predictions of one image, + a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, + br_x, br_y] format. + objectness (Tensor): Objectness predictions of one image, + a 1D-Tensor with shape [num_priors] + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should include ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + tuple: + foreground_mask (Tensor): Binary mask of foreground + targets. + cls_target (Tensor): Classification targets of an image. + obj_target (Tensor): Objectness targets of an image. + bbox_target (Tensor): BBox targets of an image. + bbox_aux_target (Tensor): BBox aux targets of an image. + num_pos_per_img (int): Number of positive samples in an image. 
+ """ + + num_priors = priors.size(0) + num_gts = len(gt_instances) + # No target + if num_gts == 0: + cls_target = cls_preds.new_zeros((0, self.num_classes)) + bbox_target = cls_preds.new_zeros((0, 4)) + bbox_aux_target = cls_preds.new_zeros((0, 4)) + obj_target = cls_preds.new_zeros((num_priors, 1)) + foreground_mask = cls_preds.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, 0) + + # YOLOX uses center priors with 0.5 offset to assign targets, + # but use center priors without offset to regress bboxes. + offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + bbox_aux_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_bbox_aux: + bbox_aux_target = self._get_bbox_aux_target( + bbox_aux_target, bbox_target, priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, num_pos_per_img) + + def _get_bbox_aux_target(self, + bbox_aux_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt 
bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + bbox_aux_target[:, :2] = (gt_cxcywh[:, :2] - + priors[:, :2]) / priors[:, 2:] + bbox_aux_target[:, + 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return bbox_aux_target + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: Tensor, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. + + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..96264e55299676239ce5a4c9b698941d0356bcea --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py @@ -0,0 +1,409 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
from collections import defaultdict
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
from mmcv.ops import batched_nms
from mmdet.models.utils import filter_scores_and_topk
from mmdet.utils import ConfigType, OptInstanceList
from mmengine.config import ConfigDict
from mmengine.model import ModuleList, bias_init_with_prob
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS
from ..utils import OutputSaveFunctionWrapper, OutputSaveObjectWrapper
from .yolox_head import YOLOXHead, YOLOXHeadModule


@MODELS.register_module()
class YOLOXPoseHeadModule(YOLOXHeadModule):
    """YOLOXPoseHeadModule serves as a head module for `YOLOX-Pose`.

    In comparison to `YOLOXHeadModule`, this module adds, for every feature
    level, a stack of pose convolutions followed by two 1x1 convolutions:
    one emitting ``num_keypoints * 2`` offset channels and one emitting
    ``num_keypoints`` visibility channels.

    Args:
        num_keypoints (int): Number of keypoints predicted per instance.
        *args: Positional arguments forwarded to `YOLOXHeadModule`.
        **kwargs: Keyword arguments forwarded to `YOLOXHeadModule`.
    """

    def __init__(self, num_keypoints: int, *args, **kwargs):
        # NOTE(review): assigned before `super().__init__`, presumably
        # because the parent constructor triggers `_init_layers`, which
        # reads this attribute -- confirm against `YOLOXHeadModule`.
        self.num_keypoints = num_keypoints
        super().__init__(*args, **kwargs)

    def _init_layers(self):
        """Initializes the layers in the head module."""
        super()._init_layers()

        # The pose branch requires additional layers for precise regression
        self.stacked_convs *= 2

        # Create separate layers for each level of feature maps
        pose_convs, offsets_preds, vis_preds = [], [], []
        for _ in self.featmap_strides:
            pose_convs.append(self._build_stacked_convs())
            offsets_preds.append(
                nn.Conv2d(self.feat_channels, self.num_keypoints * 2, 1))
            vis_preds.append(
                nn.Conv2d(self.feat_channels, self.num_keypoints, 1))

        self.multi_level_pose_convs = ModuleList(pose_convs)
        self.multi_level_conv_offsets = ModuleList(offsets_preds)
        self.multi_level_conv_vis = ModuleList(vis_preds)

    def init_weights(self):
        """Initialize weights of the head."""
        super().init_weights()

        # Use prior in model initialization to improve stability
        bias_init = bias_init_with_prob(0.01)
        for conv_vis in self.multi_level_conv_vis:
            conv_vis.bias.data.fill_(bias_init)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): One feature map per level.

        Returns:
            Tuple[List]: The outputs of the parent ``forward`` followed by
            the per-level keypoint offset predictions and the per-level
            keypoint visibility predictions.
        """
        offsets_pred, vis_pred = [], []
        for level, feat in enumerate(x):
            pose_feat = self.multi_level_pose_convs[level](feat)
            offsets_pred.append(
                self.multi_level_conv_offsets[level](pose_feat))
            vis_pred.append(self.multi_level_conv_vis[level](pose_feat))
        return (*super().forward(x), offsets_pred, vis_pred)


@MODELS.register_module()
class YOLOXPoseHead(YOLOXHead):
    """YOLOXPoseHead, the detection + keypoint head used in YOLO-Pose.

    On top of `YOLOXHead`, this head decodes keypoint offsets, feeds the
    decoded keypoints into the positive-sample assignment, and adds a
    keypoint loss (``loss_kpt``) and a visibility loss (``loss_vis``).

    Args:
        loss_pose (ConfigDict, optional): Config of keypoint OKS loss.
        *args: Positional arguments forwarded to `YOLOXHead`.
        **kwargs: Keyword arguments forwarded to `YOLOXHead`.
    """

    def __init__(
        self,
        loss_pose: Optional[ConfigType] = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # NOTE(review): the default ``None`` is handed straight to
        # ``MODELS.build``; presumably a real config is always supplied.
        self.loss_pose = MODELS.build(loss_pose)
        self.num_keypoints = self.head_module.num_keypoints

        # set up buffers to save variables generated in methods of
        # the class's base class.
        self._log = defaultdict(list)
        self.sampler = OutputSaveObjectWrapper(self.sampler)

        # ensure that the `sigmas` in self.assigner.oks_calculator
        # is on the same device as the model
        if hasattr(self.assigner, 'oks_calculator'):
            self.add_module('assigner_oks_calculator',
                            self.assigner.oks_calculator)

    def _clear(self):
        """Clear variable buffers."""
        self.sampler.clear()
        self._log.clear()

    def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list,
                                                               dict]) -> dict:
        """Compute the losses for a batch of features.

        Args:
            x (Tuple[Tensor]): Features from the upstream network.
            batch_data_samples (list | dict): A list is handed to the
                parent ``loss`` unchanged. A dict must provide the keys
                ``'bboxes_labels'``, ``'keypoints'``,
                ``'keypoints_visible'`` and ``'img_metas'``.

        Returns:
            dict: A dictionary of loss components.
        """
        if isinstance(batch_data_samples, list):
            losses = super().loss(x, batch_data_samples)
        else:
            outs = self(x)
            # Fast version
            loss_inputs = outs + (batch_data_samples['bboxes_labels'],
                                  batch_data_samples['keypoints'],
                                  batch_data_samples['keypoints_visible'],
                                  batch_data_samples['img_metas'])
            losses = self.loss_by_feat(*loss_inputs)

        return losses

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            objectnesses: Sequence[Tensor],
            kpt_preds: Sequence[Tensor],
            vis_preds: Sequence[Tensor],
            batch_gt_instances: Tensor,
            batch_gt_keypoints: Tensor,
            batch_gt_keypoints_visible: Tensor,
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        In addition to the base class method, keypoint losses are also
        calculated in this method.

        Args:
            cls_scores (Sequence[Tensor]): Per-level classification maps,
                forwarded to the parent ``loss_by_feat``.
            bbox_preds (Sequence[Tensor]): Per-level box predictions,
                forwarded to the parent ``loss_by_feat``.
            objectnesses (Sequence[Tensor]): Per-level objectness maps,
                forwarded to the parent ``loss_by_feat``.
            kpt_preds (Sequence[Tensor]): Per-level keypoint offset maps.
            vis_preds (Sequence[Tensor]): Per-level keypoint visibility
                maps.
            batch_gt_instances (Tensor): Ground truth rows for the whole
                batch; column 0 is the image index, column 1 the label and
                the remaining columns the box.
            batch_gt_keypoints (Tensor): Keypoints, row-aligned with
                ``batch_gt_instances``.
            batch_gt_keypoints_visible (Tensor): Keypoint visibility,
                row-aligned with ``batch_gt_instances``.
            batch_img_metas (Sequence[dict]): Meta information of each
                image; its length is used as the batch size.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Forwarded to the parent ``loss_by_feat``. Defaults to None.

        Returns:
            dict: The parent's losses plus ``loss_kpt`` and ``loss_vis``.
        """

        self._clear()
        batch_gt_instances = self.gt_kps_instances_preprocess(
            batch_gt_instances, batch_gt_keypoints, batch_gt_keypoints_visible,
            len(batch_img_metas))

        # collect keypoints coordinates and visibility from model predictions
        kpt_preds = torch.cat([
            kpt_pred.flatten(2).permute(0, 2, 1).contiguous()
            for kpt_pred in kpt_preds
        ],
                              dim=1)

        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
        mlvl_priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=cls_scores[0].dtype,
            device=cls_scores[0].device,
            with_stride=True)
        grid_priors = torch.cat(mlvl_priors)

        # columns 0-1 of each prior are used as the grid position and
        # column 2 as its stride
        flatten_kpts = self.decode_pose(grid_priors[..., :2], kpt_preds,
                                        grid_priors[..., 2])

        vis_preds = torch.cat([
            vis_pred.flatten(2).permute(0, 2, 1).contiguous()
            for vis_pred in vis_preds
        ],
                              dim=1)

        # compute detection losses and collect targets for keypoints
        # predictions simultaneously. The per-image slices stored here are
        # popped one at a time by `_get_targets_single`.
        self._log['pred_keypoints'] = list(flatten_kpts.detach().split(
            1, dim=0))
        self._log['pred_keypoints_vis'] = list(vis_preds.detach().split(
            1, dim=0))

        losses = super().loss_by_feat(cls_scores, bbox_preds, objectnesses,
                                      batch_gt_instances, batch_img_metas,
                                      batch_gt_instances_ignore)

        kpt_targets, vis_targets = [], []
        sampling_results = self.sampler.log['sample']
        # A sampling result is consumed only for images that have ground
        # truth, hence the separate index.
        sampling_result_idx = 0
        for gt_instances in batch_gt_instances:
            if len(gt_instances) > 0:
                sampling_result = sampling_results[sampling_result_idx]
                kpt_target = gt_instances['keypoints'][
                    sampling_result.pos_assigned_gt_inds]
                vis_target = gt_instances['keypoints_visible'][
                    sampling_result.pos_assigned_gt_inds]
                sampling_result_idx += 1
                kpt_targets.append(kpt_target)
                vis_targets.append(vis_target)

        if len(kpt_targets) > 0:
            kpt_targets = torch.cat(kpt_targets, 0)
            vis_targets = torch.cat(vis_targets, 0)

        # compute keypoint losses
        if len(kpt_targets) > 0:
            vis_targets = (vis_targets > 0).float()
            pos_masks = torch.cat(self._log['foreground_mask'], 0)
            bbox_targets = torch.cat(self._log['bbox_target'], 0)
            loss_kpt = self.loss_pose(
                flatten_kpts.view(-1, self.num_keypoints, 2)[pos_masks],
                kpt_targets, vis_targets, bbox_targets)
            # `vis_targets` is 0/1, so its sum is a whole number; clamping
            # to 1 only changes the all-invisible case, where dividing by
            # zero would otherwise yield inf/nan.
            num_visible = vis_targets.sum().clamp(min=1)
            loss_vis = self.loss_cls(
                vis_preds.view(-1, self.num_keypoints)[pos_masks],
                vis_targets) / num_visible
        else:
            # keep both branches in the graph with a zero-valued loss
            loss_kpt = kpt_preds.sum() * 0
            loss_vis = vis_preds.sum() * 0

        losses.update(dict(loss_kpt=loss_kpt, loss_vis=loss_vis))

        self._clear()
        return losses

    @torch.no_grad()
    def _get_targets_single(
            self,
            priors: Tensor,
            cls_preds: Tensor,
            decoded_bboxes: Tensor,
            objectness: Tensor,
            gt_instances: InstanceData,
            img_meta: dict,
            gt_instances_ignore: Optional[InstanceData] = None) -> tuple:
        """Calculates targets for a single image, and saves them to the log.

        This method is similar to the _get_targets_single method in the base
        class, but additionally saves the foreground mask and bbox targets to
        the log.

        The arguments are passed to the parent method unchanged, except
        ``decoded_bboxes``, which is extended with the logged keypoint
        predictions of the current image.

        Returns:
            tuple: The parent's targets, unchanged.
        """

        # Construct a combined representation of bboxes and keypoints to
        # ensure keypoints are also involved in the positive sample
        # assignment process
        kpt = self._log['pred_keypoints'].pop(0).squeeze(0)
        kpt_vis = self._log['pred_keypoints_vis'].pop(0).squeeze(0)
        kpt = torch.cat((kpt, kpt_vis.unsqueeze(-1)), dim=-1)
        decoded_bboxes = torch.cat((decoded_bboxes, kpt.flatten(1)), dim=1)

        targets = super()._get_targets_single(priors, cls_preds,
                                              decoded_bboxes, objectness,
                                              gt_instances, img_meta,
                                              gt_instances_ignore)
        # entries 0 and 3 are stored as the foreground mask and the bbox
        # target consumed by `loss_by_feat`
        self._log['foreground_mask'].append(targets[0])
        self._log['bbox_target'].append(targets[3])
        return targets

    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        objectnesses: Optional[List[Tensor]] = None,
                        kpt_preds: Optional[List[Tensor]] = None,
                        vis_preds: Optional[List[Tensor]] = None,
                        batch_img_metas: Optional[List[dict]] = None,
                        cfg: Optional[ConfigDict] = None,
                        rescale: bool = True,
                        with_nms: bool = True) -> List[InstanceData]:
        """Transform a batch of output features extracted by the head into bbox
        and keypoint results.

        In addition to the base class method, keypoint predictions are also
        calculated in this method.

        Args:
            cls_scores (List[Tensor]): Per-level classification maps.
            bbox_preds (List[Tensor]): Per-level box predictions.
            objectnesses (List[Tensor], optional): Per-level objectness
                maps.
            kpt_preds (List[Tensor], optional): Per-level keypoint offset
                maps.
            vis_preds (List[Tensor], optional): Per-level keypoint
                visibility maps; a sigmoid is applied to them here.
            batch_img_metas (List[dict], optional): Meta information of
                each image. ``'scale_factor'`` is read when ``rescale`` is
                True, ``'pad_param'`` is read when present.
            cfg (ConfigDict, optional): Test config; ``max_per_img`` is
                read from it. Defaults to None.
            rescale (bool): Whether to map keypoints back to the original
                image space. Defaults to True.
            with_nms (bool): Forwarded to the parent method.
                Defaults to True.

        Returns:
            List[InstanceData]: The parent's results, converted with
            ``.numpy()``, each extended with ``bbox_scores``,
            ``keypoints`` and ``keypoint_scores``.
        """
        # `cfg.max_per_img` is dereferenced below, so a missing `cfg` has
        # to be resolved here rather than inside the parent call.
        # NOTE(review): assumes the base head stores its test config as
        # `self.test_cfg` -- confirm.
        if cfg is None:
            cfg = getattr(self, 'test_cfg', None)

        # calculate predicted bboxes and get the kept instances indices.
        #
        # use OutputSaveFunctionWrapper as context manager to obtain
        # intermediate output from a parent class without copying a
        # large block of code
        with OutputSaveFunctionWrapper(
                filter_scores_and_topk,
                super().predict_by_feat.__globals__) as outputs_1:
            with OutputSaveFunctionWrapper(
                    batched_nms,
                    super()._bbox_post_process.__globals__) as outputs_2:
                results_list = super().predict_by_feat(cls_scores, bbox_preds,
                                                       objectnesses,
                                                       batch_img_metas, cfg,
                                                       rescale, with_nms)
                keep_indices_topk = [
                    out[2][:cfg.max_per_img] for out in outputs_1
                ]
                keep_indices_nms = [
                    out[1][:cfg.max_per_img] for out in outputs_2
                ]

        num_imgs = len(batch_img_metas)

        # recover keypoints coordinates from model predictions
        featmap_sizes = [vis_pred.shape[2:] for vis_pred in vis_preds]
        priors = torch.cat(self.mlvl_priors)
        strides = [
            priors.new_full((featmap_size.numel() * self.num_base_priors, ),
                            stride) for featmap_size, stride in zip(
                                featmap_sizes, self.featmap_strides)
        ]
        strides = torch.cat(strides)
        kpt_preds = torch.cat([
            kpt_pred.permute(0, 2, 3, 1).reshape(
                num_imgs, -1, self.num_keypoints * 2) for kpt_pred in kpt_preds
        ],
                              dim=1)
        flatten_decoded_kpts = self.decode_pose(priors, kpt_preds, strides)

        vis_preds = torch.cat([
            vis_pred.permute(0, 2, 3, 1).reshape(
                num_imgs, -1, self.num_keypoints) for vis_pred in vis_preds
        ],
                              dim=1).sigmoid()

        # select keypoints predictions according to bbox scores and nms result
        keep_indices_nms_idx = 0
        for pred_instances, kpts, kpts_vis, img_meta, keep_idxs \
            in zip(
                results_list, flatten_decoded_kpts, vis_preds,
                batch_img_metas, keep_indices_topk):

            pred_instances.bbox_scores = pred_instances.scores

            if len(pred_instances) == 0:
                # empty results still get (empty) keypoint fields; no NMS
                # output is consumed for them
                pred_instances.keypoints = kpts[:0]
                pred_instances.keypoint_scores = kpts_vis[:0]
                continue

            kpts = kpts[keep_idxs]
            kpts_vis = kpts_vis[keep_idxs]

            if rescale:
                # The padding entry is read from the 'pad_param' key.
                # This code previously queried the key 'img_meta', which
                # left the un-padding step below dead in practice.
                # NOTE(review): 'pad_param' is the key the parent head is
                # expected to use for the boxes -- confirm against the
                # resize transforms in the pipeline.
                pad_param = img_meta.get('pad_param', None)
                scale_factor = img_meta['scale_factor']
                if pad_param is not None:
                    # entry 2 is subtracted from x, entry 0 from y
                    kpts -= kpts.new_tensor([pad_param[2], pad_param[0]])
                kpts /= kpts.new_tensor(scale_factor).repeat(
                    (1, self.num_keypoints, 1))

            keep_idxs_nms = keep_indices_nms[keep_indices_nms_idx]
            kpts = kpts[keep_idxs_nms]
            kpts_vis = kpts_vis[keep_idxs_nms]
            keep_indices_nms_idx += 1

            pred_instances.keypoints = kpts
            pred_instances.keypoint_scores = kpts_vis

        results_list = [r.numpy() for r in results_list]
        return results_list

    def decode_pose(self, grids: torch.Tensor, offsets: torch.Tensor,
                    strides: Union[torch.Tensor, int]) -> torch.Tensor:
        """Decode regression offsets to keypoints.

        The result is ``offsets * strides + grids``, with ``offsets``
        reshaped so that its last dimension holds one (x, y) pair per
        keypoint.

        Args:
            grids (torch.Tensor): The coordinates of the feature map grids.
            offsets (torch.Tensor): The predicted offset of each keypoint
                relative to its corresponding grid.
            strides (torch.Tensor | int): The stride of the feature map for
                each instance.
        Returns:
            torch.Tensor: The decoded keypoints coordinates.
        """

        if isinstance(strides, int):
            # match dtype and device of the offsets
            strides = torch.tensor([strides]).to(offsets)

        strides = strides.reshape(1, -1, 1, 1)
        offsets = offsets.reshape(*offsets.shape[:2], -1, 2)
        xy_coordinates = (offsets[..., :2] * strides) + grids.unsqueeze(1)
        return xy_coordinates

    @staticmethod
    def gt_kps_instances_preprocess(batch_gt_instances: Tensor,
                                    batch_gt_keypoints,
                                    batch_gt_keypoints_visible,
                                    batch_size: int) -> List[InstanceData]:
        """Split batch_gt_instances with batch size.

        Args:
            batch_gt_instances (Tensor): Ground truth
                a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6]
            batch_gt_keypoints (Tensor): Keypoints, row-aligned with
                ``batch_gt_instances``.
            batch_gt_keypoints_visible (Tensor): Keypoint visibility,
                row-aligned with ``batch_gt_instances``.
            batch_size (int): Batch size.

        Returns:
            List: batch gt instances data, shape [batch_size, InstanceData]
        """
        # faster version
        batch_instance_list = []
        # column 0 holds the image index of every ground-truth row
        image_ids = batch_gt_instances[:, 0]
        for i in range(batch_size):
            # one mask per image, shared by boxes, keypoints and visibility
            in_image = image_ids == i
            batch_gt_instance_ = InstanceData()
            single_batch_instance = batch_gt_instances[in_image, :]
            batch_gt_instance_.bboxes = single_batch_instance[:, 2:]
            batch_gt_instance_.labels = single_batch_instance[:, 1]
            batch_gt_instance_.keypoints = batch_gt_keypoints[in_image, :]
            batch_gt_instance_.keypoints_visible = \
                batch_gt_keypoints_visible[in_image, :]
            batch_instance_list.append(batch_gt_instance_)

        return batch_instance_list

    @staticmethod
    def gt_instances_preprocess(batch_gt_instances: List[InstanceData], *args,
                                **kwargs) -> List[InstanceData]:
        """Return ``batch_gt_instances`` unchanged.

        `loss_by_feat` already splits the batch with
        `gt_kps_instances_preprocess`, so this override is a pass-through.
        """
        return batch_gt_instances


# ---------------------------------------------------------------------------
# Patch metadata that followed this module in the dump, kept as comments.
# ---------------------------------------------------------------------------
# diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..74fb1c6c21c5840a5cd3f45a1a9f827c0e670604
# --- /dev/null
# +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/__init__.py
# @@ -0,0 +1,4 @@
# +# Copyright (c) OpenMMLab. All rights reserved.
# +from .yolo_detector import YOLODetector
# +
# +__all__ = ['YOLODetector']
# diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e6783fbab41287df54f136ea121e827d0603414f
# --- /dev/null
# +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py
# @@ -0,0 +1,53 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.detectors.single_stage import SingleStageDetector
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
from mmengine.dist import get_world_size
from mmengine.logging import print_log

from mmyolo.registry import MODELS


@MODELS.register_module()
class YOLODetector(SingleStageDetector):
    r"""Single-stage detector shared by the YOLO series.

    The constructor forwards every model config to
    `SingleStageDetector` and then, when requested and running with more
    than one process, converts the batch-norm layers of the whole detector
    to `SyncBatchNorm`.

    Args:
        backbone (:obj:`ConfigDict` or dict): The backbone config.
        neck (:obj:`ConfigDict` or dict): The neck config.
        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
        train_cfg (:obj:`ConfigDict` or dict, optional): The training
            config of YOLO. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
            of YOLO. Defaults to None.
        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
            :class:`DetDataPreprocessor` to process the input data.
            Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
        use_syncbn (bool): Whether to use SyncBatchNorm. Defaults to True.
    """

    def __init__(self,
                 backbone: ConfigType,
                 neck: ConfigType,
                 bbox_head: ConfigType,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
                 init_cfg: OptMultiConfig = None,
                 use_syncbn: bool = True):
        parent_kwargs = dict(
            backbone=backbone,
            neck=neck,
            bbox_head=bbox_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            data_preprocessor=data_preprocessor,
            init_cfg=init_cfg)
        super().__init__(**parent_kwargs)

        # TODO: Waiting for mmengine support
        if not use_syncbn:
            return
        # the world size is only queried once SyncBN has been requested
        if not get_world_size() > 1:
            return
        torch.nn.SyncBatchNorm.convert_sync_batchnorm(self)
        print_log('Using SyncBatchNorm()', 'current')


# ---------------------------------------------------------------------------
# Patch metadata that followed this module in the dump, kept as comments.
# ---------------------------------------------------------------------------
# diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..02753057f2ddf51b0688f4f65ebc52e12be9fa7a
# --- /dev/null
# +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/__init__.py
# @@ -0,0 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .ema import ExpMomentumEMA +from .yolo_bricks import (BepC3StageBlock, BiFusion, CSPLayerWithTwoConv, + DarknetBottleneck, EELANBlock, EffectiveSELayer, + ELANBlock, ImplicitA, ImplicitM, + MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock, + RepStageBlock, RepVGGBlock, SPPFBottleneck, + SPPFCSPBlock, TinyDownSampleBlock) + +__all__ = [ + 'SPPFBottleneck', 'RepVGGBlock', 'RepStageBlock', 'ExpMomentumEMA', + 'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock', + 'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock', + 'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock', + 'CSPLayerWithTwoConv', 'DarknetBottleneck', 'BiFusion' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/ema.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed204190ee4a5ab9395eddce5866545caac2c0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/ema.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmdet.models.layers import ExpMomentumEMA as MMDET_ExpMomentumEMA +from torch import Tensor + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(MMDET_ExpMomentumEMA): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLO. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameters are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. 
+ interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. + """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False): + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + # Note: There is no need to re-fetch every update, + # as most models do not change their structure + # during the training process. + self.src_parameters = ( + model.state_dict() + if self.update_buffers else dict(model.named_parameters())) + if not self.update_buffers: + self.src_buffers = model.buffers() + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int): + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.lerp_(source_param, momentum) + + def update_parameters(self, model: nn.Module): + """Update the parameters after each training step. + + Args: + model (nn.Module): The model of the parameter needs to be updated. 
+ """ + if self.steps == 0: + for k, p_avg in self.avg_parameters.items(): + p_avg.data.copy_(self.src_parameters[k].data) + elif self.steps % self.interval == 0: + for k, p_avg in self.avg_parameters.items(): + if p_avg.dtype.is_floating_point: + self.avg_func(p_avg.data, self.src_parameters[k].data, + self.steps) + if not self.update_buffers: + # If not update the buffers, + # keep the buffers in sync with the source model. + for b_avg, b_src in zip(self.module.buffers(), self.src_buffers): + b_avg.data.copy_(b_src.data) + self.steps += 1 diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..19175be1a0e88f5bb7fb87b6810c52050293d890 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py @@ -0,0 +1,1728 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, MaxPool2d, + build_norm_layer) +from mmdet.models.layers.csp_layer import \ + DarknetBottleneck as MMDET_DarknetBottleneck +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmengine.utils import digit_version +from torch import Tensor + +from mmyolo.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + MODELS.register_module(module=nn.SiLU, name='SiLU') +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> Tensor: + return inputs * torch.sigmoid(inputs) + + MODELS.register_module(module=SiLU, name='SiLU') + + +class SPPFBottleneck(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer for + YOLOv5, YOLOX and 
PPYOLOE by Glenn Jocher + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv2 = ConvModule( + conv2_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv1: + x = self.conv1(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x) + y2 = self.poolings(y1) + x = torch.cat([x, y1, y2, self.poolings(y2)], dim=1) + else: + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class RepVGGBlock(nn.Module): + """RepVGGBlock is a basic rep-style block, including training and deploy + status This code is based on + https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py. 
+ + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + padding (int, tuple): Padding added to all four sides of + the input. Default: 1 + dilation (int or tuple): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + use_se (bool): Whether to use se. Default: False + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + In PPYOLOE+ model backbone, `use_alpha` will be set to True. + Default: False. + use_bn_first (bool): Whether to use bn layer before conv. + In YOLOv6 and YOLOv7, this will be set to True. + In PPYOLOE, this will be set to False. + Default: True. + deploy (bool): Whether in deploy mode. Default: False + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = 3, + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 1, + dilation: Union[int, Tuple[int]] = 1, + groups: Optional[int] = 1, + padding_mode: Optional[str] = 'zeros', + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + use_se: bool = False, + use_alpha: bool = False, + use_bn_first=True, + deploy: bool = False): + super().__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = MODELS.build(act_cfg) + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if use_alpha: + alpha = torch.ones([ + 1, + ], dtype=torch.float32, requires_grad=True) + 
self.alpha = nn.Parameter(alpha, requires_grad=True) + else: + self.alpha = None + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + if use_bn_first and (out_channels == in_channels) and stride == 1: + self.rbr_identity = build_norm_layer( + norm_cfg, num_features=in_channels)[1] + else: + self.rbr_identity = None + + self.rbr_dense = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.rbr_1x1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, inputs: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + if self.alpha: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + + self.alpha * self.rbr_1x1(inputs) + id_out)) + else: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + """Derives the equivalent kernel and bias in a differentiable way. 
+ + Returns: + tuple: Equivalent kernel and bias + """ + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + self.alpha * bias1x1 + biasid + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + """Pad 1x1 tensor to 3x3. + Args: + kernel1x1 (Tensor): The input 1x1 kernel need to be padded. + + Returns: + Tensor: 3x3 kernel after padded. + """ + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: nn.Module) -> Tuple[np.ndarray, Tensor]: + """Derives the equivalent kernel and bias of a specific branch layer. + + Args: + branch (nn.Module): The layer that needs to be equivalently + transformed, which can be nn.Sequential or nn.Batchnorm2d + + Returns: + tuple: Equivalent kernel and bias + """ + if branch is None: + return 0, 0 + if isinstance(branch, ConvModule): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, (nn.SyncBatchNorm, nn.BatchNorm2d)) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / 
std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + """Switch to deploy mode.""" + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + +@MODELS.register_module() +class BepC3StageBlock(nn.Module): + """Beer-mug RepC3 Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + num_blocks (int): Number of blocks. Defaults to 1 + hidden_ratio (float): Hidden channel expansion. + Default: 0.5 + concat_all_layer (bool): Concat all layer when forward calculate. + Default: True + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + hidden_ratio: float = 0.5, + concat_all_layer: bool = True, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + hidden_channels = int(out_channels * hidden_ratio) + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + 2 * hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.block = RepStageBlock( + in_channels=hidden_channels, + out_channels=hidden_channels, + num_blocks=num_blocks, + block_cfg=block_cfg, + bottle_block=BottleRep) + self.concat_all_layer = concat_all_layer + if not concat_all_layer: + self.conv3 = ConvModule( + hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + if self.concat_all_layer is True: + return self.conv3( + torch.cat((self.block(self.conv1(x)), self.conv2(x)), dim=1)) + else: + return self.conv3(self.block(self.conv1(x))) + + +class BottleRep(nn.Module): + """Bottle Rep Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + adaptive_weight (bool): Add adaptive_weight when forward calculate. + Defaults False. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + adaptive_weight: bool = False): + super().__init__() + conv1_cfg = block_cfg.copy() + conv2_cfg = block_cfg.copy() + + conv1_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + conv2_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(conv1_cfg) + self.conv2 = MODELS.build(conv2_cfg) + + if in_channels != out_channels: + self.shortcut = False + else: + self.shortcut = True + if adaptive_weight: + self.alpha = nn.Parameter(torch.ones(1)) + else: + self.alpha = 1.0 + + def forward(self, x: Tensor) -> Tensor: + outputs = self.conv1(x) + outputs = self.conv2(outputs) + return outputs + self.alpha * x if self.shortcut else outputs + + +@MODELS.register_module() +class ConvWrapper(nn.Module): + """Wrapper for normal Conv with SiLU activation. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): Conv bias. Default: True. + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + groups: int = 1, + bias: bool = True, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = dict(type='SiLU')): + super().__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + padding=kernel_size // 2, + groups=groups, + bias=bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.block(x) + + +@MODELS.register_module() +class EffectiveSELayer(nn.Module): + """Effective Squeeze-Excitation. + + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` + arxiv (https://arxiv.org/abs/1911.06667) + This code referenced to + https://github.com/youngwanLEE/CenterMask/blob/72147e8aae673fcaf4103ee90a6a6b73863e7fa1/maskrcnn_benchmark/modeling/backbone/vovnet.py#L108-L121 # noqa + + Args: + channels (int): The input and output channels of this Module. + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='HSigmoid'). + """ + + def __init__(self, + channels: int, + act_cfg: ConfigType = dict(type='HSigmoid')): + super().__init__() + assert isinstance(act_cfg, dict) + self.fc = ConvModule(channels, channels, 1, act_cfg=None) + + act_cfg_ = act_cfg.copy() # type: ignore + self.activate = MODELS.build(act_cfg_) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.activate(x_se) + + +class PPYOLOESELayer(nn.Module): + """Squeeze-and-Excitation Attention Module for PPYOLOE. + There are some differences between the current implementation and + SELayer in mmdet: + 1. For fast speed and avoiding double inference in ppyoloe, + use `F.adaptive_avg_pool2d` before PPYOLOESELayer. + 2. Special ways to init weights. + 3. Different convolution order. 
+ + Args: + feat_channels (int): The input (and output) channels of the SE layer. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + """ + + def __init__(self, + feat_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True)): + super().__init__() + self.fc = nn.Conv2d(feat_channels, feat_channels, 1) + self.sig = nn.Sigmoid() + self.conv = ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self._init_weights() + + def _init_weights(self): + """Init weights.""" + nn.init.normal_(self.fc.weight, mean=0, std=0.001) + + def forward(self, feat: Tensor, avg_feat: Tensor) -> Tensor: + """Forward process + Args: + feat (Tensor): The input tensor. + avg_feat (Tensor): Average pooling feature tensor. + """ + weight = self.sig(self.fc(avg_feat)) + return self.conv(feat * weight) + + +@MODELS.register_module() +class ELANBlock(BaseModule): + """Efficient layer aggregation networks for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. + block_ratio (float): The scaling ratio of the block layer + based on the in_channels. + num_blocks (int): The number of blocks in the main branch. + Defaults to 2. + num_convs_in_block (int): The number of convs pre block. + Defaults to 1. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + middle_ratio: float, + block_ratio: float, + num_blocks: int = 2, + num_convs_in_block: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert num_blocks >= 1 + assert num_convs_in_block >= 1 + + middle_channels = int(in_channels * middle_ratio) + block_channels = int(in_channels * block_ratio) + final_conv_in_channels = int( + num_blocks * block_channels) + 2 * middle_channels + + self.main_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList() + for _ in range(num_blocks): + if num_convs_in_block == 1: + internal_block = ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + internal_block = [] + for _ in range(num_convs_in_block): + internal_block.append( + ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + middle_channels = block_channels + internal_block = nn.Sequential(*internal_block) + + middle_channels = block_channels + self.blocks.append(internal_block) + + self.final_conv = ConvModule( + final_conv_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. 
+ """ + x_short = self.short_conv(x) + x_main = self.main_conv(x) + block_outs = [] + x_block = x_main + for block in self.blocks: + x_block = block(x_block) + block_outs.append(x_block) + x_final = torch.cat((*block_outs[::-1], x_main, x_short), dim=1) + return self.final_conv(x_final) + + +@MODELS.register_module() +class EELANBlock(BaseModule): + """Expand efficient layer aggregation networks for YOLOv7. + + Args: + num_elan_block (int): The number of ELANBlock. + """ + + def __init__(self, num_elan_block: int, **kwargs): + super().__init__() + assert num_elan_block >= 1 + self.e_elan_blocks = nn.ModuleList() + for _ in range(num_elan_block): + self.e_elan_blocks.append(ELANBlock(**kwargs)) + + def forward(self, x: Tensor) -> Tensor: + outs = [] + for elan_blocks in self.e_elan_blocks: + outs.append(elan_blocks(x)) + return sum(outs) + + +class MaxPoolAndStrideConvBlock(BaseModule): + """Max pooling and stride conv layer for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + maxpool_kernel_sizes (int): kernel sizes of pooling layers. + Defaults to 2. + use_in_channels_of_middle (bool): Whether to calculate middle channels + based on in_channels. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + maxpool_kernel_sizes: int = 2, + use_in_channels_of_middle: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + middle_channels = in_channels if use_in_channels_of_middle \ + else out_channels // 2 + + self.maxpool_branches = nn.Sequential( + MaxPool2d( + kernel_size=maxpool_kernel_sizes, stride=maxpool_kernel_sizes), + ConvModule( + in_channels, + out_channels // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.stride_conv_branches = nn.Sequential( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + middle_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + maxpool_out = self.maxpool_branches(x) + stride_conv_out = self.stride_conv_branches(x) + return torch.cat([stride_conv_out, maxpool_out], dim=1) + + +@MODELS.register_module() +class TinyDownSampleBlock(BaseModule): + """Down sample layer for YOLOv7-tiny. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. Defaults to 1.0. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 3. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). 
+ act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + middle_ratio: float = 1.0, + kernel_sizes: Union[int, Sequence[int]] = 3, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + middle_channels = int(in_channels * middle_ratio) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.main_convs = nn.ModuleList() + for i in range(3): + if i == 0: + self.main_convs.append( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.main_convs.append( + ConvModule( + middle_channels, + middle_channels, + kernel_sizes, + padding=(kernel_sizes - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.final_conv = ConvModule( + middle_channels * 4, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + short_out = self.short_conv(x) + + main_outs = [] + for main_conv in self.main_convs: + main_out = main_conv(x) + main_outs.append(main_out) + x = main_out + + return self.final_conv(torch.cat([*main_outs[::-1], short_out], dim=1)) + + +@MODELS.register_module() +class SPPFCSPBlock(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer with CSP for + YOLOv7 + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. 
+ kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + is_tiny_version (bool): Is tiny version of SPPFCSPBlock. If True, + it means it is a yolov7 tiny model. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + kernel_sizes: Union[int, Sequence[int]] = 5, + is_tiny_version: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.is_tiny_version = is_tiny_version + + mid_channels = int(2 * out_channels * expand_ratio) + + if is_tiny_version: + self.main_layers = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.main_layers = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in 
kernel_sizes + ]) + + if is_tiny_version: + self.fuse_layers = ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.fuse_layers = nn.Sequential( + ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.short_layer = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x1 = self.main_layers(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x1) + y2 = self.poolings(y1) + concat_list = [x1] + [y1, y2, self.poolings(y2)] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + else: + concat_list = [x1] + [m(x1) for m in self.poolings] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + + x2 = self.short_layer(x) + return self.final_conv(torch.cat((x1, x2), dim=1)) + + +class ImplicitA(nn.Module): + """Implicit add layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + mean (float): Mean value of implicit module. Defaults to 0. + std (float): Std value of implicit module. 
Defaults to 0.02 + """ + + def __init__(self, in_channels: int, mean: float = 0., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.zeros(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit + x + + +class ImplicitM(nn.Module): + """Implicit multiplier layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + mean (float): Mean value of implicit module. Defaults to 1. + std (float): Std value of implicit module. Defaults to 0.02. + """ + + def __init__(self, in_channels: int, mean: float = 1., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.ones(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit * x + + +@MODELS.register_module() +class PPYOLOEBasicBlock(nn.Module): + """PPYOLOE Backbone BasicBlock. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + shortcut (bool): Whether to add inputs and outputs together + at the end of this layer. Defaults to True. + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + shortcut: bool = True, + use_alpha: bool = False): + super().__init__() + assert act_cfg is None or isinstance(act_cfg, dict) + self.conv1 = ConvModule( + in_channels, + out_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = RepVGGBlock( + out_channels, + out_channels, + use_alpha=use_alpha, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + use_bn_first=False) + self.shortcut = shortcut + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class CSPResLayer(nn.Module): + """PPYOLOE Backbone Stage. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_block (int): Number of blocks in this stage. + block_cfg (dict): Config dict for block. Default config is + suitable for PPYOLOE+ backbone. And in PPYOLOE neck, + block_cfg is set to dict(type='PPYOLOEBasicBlock', + shortcut=False, use_alpha=False). Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True). + stride (int): Stride of the convolution. In backbone, the stride + must be set to 2. In neck, the stride must be set to 1. + Defaults to 1. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict, optional): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + use_spp (bool): Whether to use `SPPFBottleneck` layer. + Defaults to False. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_block: int, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + stride: int = 1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: OptMultiConfig = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_spp: bool = False): + super().__init__() + + self.num_block = num_block + self.block_cfg = block_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.use_spp = use_spp + assert attention_cfg is None or isinstance(attention_cfg, dict) + + if stride == 2: + conv1_in_channels = conv2_in_channels = conv3_in_channels = ( + in_channels + out_channels) // 2 + blocks_channels = conv1_in_channels // 2 + self.conv_down = ConvModule( + in_channels, + conv1_in_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + conv1_in_channels = conv2_in_channels = in_channels + conv3_in_channels = out_channels + blocks_channels = out_channels // 2 + self.conv_down = None + + self.conv1 = ConvModule( + conv1_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = ConvModule( + conv2_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = self.build_blocks_layer(blocks_channels) + + self.conv3 = ConvModule( + conv3_in_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if attention_cfg: + attention_cfg = attention_cfg.copy() + attention_cfg['channels'] = blocks_channels * 2 + self.attn = MODELS.build(attention_cfg) + else: + self.attn = None + + def build_blocks_layer(self, blocks_channels: int) -> nn.Module: + """Build blocks layer. + + Args: + blocks_channels: The channels of this Module. 
+ """ + blocks = nn.Sequential() + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict(in_channels=blocks_channels, out_channels=blocks_channels)) + block_cfg.setdefault('norm_cfg', self.norm_cfg) + block_cfg.setdefault('act_cfg', self.act_cfg) + + for i in range(self.num_block): + blocks.add_module(str(i), MODELS.build(block_cfg)) + + if i == (self.num_block - 1) // 2 and self.use_spp: + blocks.add_module( + 'spp', + SPPFBottleneck( + blocks_channels, + blocks_channels, + kernel_sizes=[5, 9, 13], + use_conv_first=False, + conv_cfg=None, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + return blocks + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = torch.cat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@MODELS.register_module() +class RepStageBlock(nn.Module): + """RepStageBlock is a stage block with rep-style basic block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_blocks (int, tuple[int]): Number of blocks. Defaults to 1. + bottle_block (nn.Module): Basic unit of RepStage. + Defaults to RepVGGBlock. + block_cfg (ConfigType): Config of RepStage. + Defaults to 'RepVGGBlock'. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + bottle_block: nn.Module = RepVGGBlock, + block_cfg: ConfigType = dict(type='RepVGGBlock')): + super().__init__() + block_cfg = block_cfg.copy() + + block_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(block_cfg) + + block_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(MODELS.build(block_cfg) + for _ in range(num_blocks - 1))) + + if bottle_block == BottleRep: + self.conv1 = BottleRep( + in_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) + num_blocks = num_blocks // 2 + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(BottleRep( + out_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) for _ in range(num_blocks - 1))) + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class DarknetBottleneck(MMDET_DarknetBottleneck): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of k1Xk1 and the second one has the + filter size of k2Xk2. + + Note: + This DarknetBottleneck is little different from MMDet's, we can + change the kernel size and padding for each conv. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size for hidden channel. + Defaults to 0.5. + kernel_size (Sequence[int]): The kernel size of the convolution. + Defaults to (1, 3). 
+ padding (Sequence[int]): The padding size of the convolution. + Defaults to (0, 1). + add_identity (bool): Whether to add identity to the out. + Defaults to True + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + kernel_size: Sequence[int] = (1, 3), + padding: Sequence[int] = (0, 1), + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels, out_channels, init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + assert isinstance(kernel_size, Sequence) and len(kernel_size) == 2 + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size[0], + padding=padding[0], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + kernel_size[1], + stride=1, + padding=padding[1], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + +class CSPLayerWithTwoConv(BaseModule): + """Cross Stage Partial Layer with 2 convolutions. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. 
class BiFusion(nn.Module):
    """BiFusion Block in YOLOv6.

    BiFusion fuses current-, high- and low-level features.
    Compared with concatenation in PAN, it fuses an extra low-level feature.

    Args:
        in_channels0 (int): The channels of current-level feature.
        in_channels1 (int): The input channels of lower-level feature.
        out_channels (int): The out channels of the BiFusion module. The
            high-level input must already have this many channels because
            it is only passed through the transposed convolution.
        norm_cfg (dict): Config dict for normalization layer.
            Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='ReLU', inplace=True).
    """

    def __init__(self,
                 in_channels0: int,
                 in_channels1: int,
                 out_channels: int,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='ReLU', inplace=True)):
        super().__init__()
        # The three 1x1 projections only differ in their input channels.
        pointwise_cfg = dict(
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Projects the current-level feature.
        self.conv1 = ConvModule(in_channels0, out_channels, **pointwise_cfg)
        # Projects the low-level feature before it is downsampled.
        self.conv2 = ConvModule(in_channels1, out_channels, **pointwise_cfg)
        # Fuses the concatenation of the three aligned branches.
        self.conv3 = ConvModule(out_channels * 3, out_channels,
                                **pointwise_cfg)
        # Learned 2x upsampling for the high-level feature.
        self.upsample = nn.ConvTranspose2d(
            out_channels, out_channels, kernel_size=2, stride=2, bias=True)
        # Strided 3x3 conv that halves the low-level feature resolution.
        self.downsample = ConvModule(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

    def forward(self, x: List[torch.Tensor]) -> Tensor:
        """Fuse three neighbouring feature levels into one feature map.

        Args:
            x (List[torch.Tensor]): The tensor list of length 3.
                x[0]: The high-level feature with ``out_channels`` channels;
                    it is upsampled by a factor of 2.
                x[1]: The current-level feature with ``in_channels0``
                    channels.
                x[2]: The low-level feature with ``in_channels1`` channels;
                    it is downsampled by a factor of 2.

        Returns:
            Tensor: The fused feature with ``out_channels`` channels at the
            resolution of the current-level feature.
        """
        x0 = self.upsample(x[0])
        x1 = self.conv1(x[1])
        x2 = self.downsample(self.conv2(x[2]))
        # All three branches now share channels and spatial size.
        return self.conv3(torch.cat((x0, x1, x2), dim=1))
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv4 = ConvModule( + mid_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.conv3 = None + self.conv4 = None + + self.conv2 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.kernel_sizes = kernel_sizes + + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv5 = ConvModule( + conv2_in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv6 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv7 = ConvModule( + mid_channels * 2, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: 
def bbox_overlaps(pred: torch.Tensor,
                  target: torch.Tensor,
                  iou_mode: str = 'ciou',
                  bbox_format: str = 'xywh',
                  siou_theta: float = 4.0,
                  eps: float = 1e-7) -> torch.Tensor:
    r"""Calculate the element-wise overlap between two sets of bboxes.

    The CIoU variant follows the paper "Enhancing Geometric Factors into
    Model Learning and Inference for Object Detection and Instance
    Segmentation".

    In the CIoU implementation of YOLOv5 and MMDetection, there is a slight
    difference in the way the alpha parameter is computed.

    mmdet version:
        alpha = (ious > 0.5).float() * v / (1 - ious + v)
    YOLOv5 version (the one implemented here):
        alpha = v / (v - ious + (1 + eps))

    Args:
        pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2)
            or (cx, cy, w, h), shape (n, 4).
        target (Tensor): Corresponding gt bboxes, shape (n, 4).
        iou_mode (str): Options are ('iou', 'ciou', 'giou', 'siou').
            Defaults to "ciou".
        bbox_format (str): Options are "xywh" and "xyxy".
            Defaults to "xywh".
        siou_theta (float): siou_theta for SIoU when calculate shape cost.
            Defaults to 4.0.
        eps (float): Small constant that keeps the divisions well defined.
            Defaults to 1e-7.

    Returns:
        Tensor: shape (n, ), clamped to [-1, 1].
    """
    assert iou_mode in ('iou', 'ciou', 'giou', 'siou')
    assert bbox_format in ('xyxy', 'xywh')
    if bbox_format == 'xywh':
        pred = HorizontalBoxes.cxcywh_to_xyxy(pred)
        target = HorizontalBoxes.cxcywh_to_xyxy(target)

    # bbox1 is the prediction, bbox2 is the target.
    bbox1_x1, bbox1_y1 = pred[..., 0], pred[..., 1]
    bbox1_x2, bbox1_y2 = pred[..., 2], pred[..., 3]
    bbox2_x1, bbox2_y1 = target[..., 0], target[..., 1]
    bbox2_x2, bbox2_y2 = target[..., 2], target[..., 3]

    # Overlap
    overlap = (torch.min(bbox1_x2, bbox2_x2) -
               torch.max(bbox1_x1, bbox2_x1)).clamp(0) * \
              (torch.min(bbox1_y2, bbox2_y2) -
               torch.max(bbox1_y1, bbox2_y1)).clamp(0)

    # Union
    w1, h1 = bbox1_x2 - bbox1_x1, bbox1_y2 - bbox1_y1
    w2, h2 = bbox2_x2 - bbox2_x1, bbox2_y2 - bbox2_y1
    union = (w1 * h1) + (w2 * h2) - overlap + eps

    # From here on the heights carry eps so that w / h is always defined.
    h1 = h1 + eps
    h2 = h2 + eps

    # IoU
    ious = overlap / union

    # Smallest box enclosing both pred and target.
    enclose_x1y1 = torch.min(pred[..., :2], target[..., :2])
    enclose_x2y2 = torch.max(pred[..., 2:], target[..., 2:])
    enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)

    enclose_w = enclose_wh[..., 0]  # cw
    enclose_h = enclose_wh[..., 1]  # ch

    if iou_mode == 'ciou':
        # CIoU = IoU - ( (ρ^2(b_pred,b_gt) / c^2) + (alpha x v) )

        # squared diagonal of the enclosing box (c^2)
        enclose_area = enclose_w**2 + enclose_h**2 + eps

        # calculate ρ^2(b_pred,b_gt):
        # squared euclidean distance between the centers of b_gt (bbox2)
        # and b_pred (bbox1). The boxes are xyxy, so (x1 + x2) is twice the
        # center coordinate and the squared difference is divided by 4.
        rho2_left_item = ((bbox2_x1 + bbox2_x2) - (bbox1_x1 + bbox1_x2))**2 / 4
        rho2_right_item = ((bbox2_y1 + bbox2_y2) -
                           (bbox1_y1 + bbox1_y2))**2 / 4
        rho2 = rho2_left_item + rho2_right_item  # rho^2 (ρ^2)

        # Width and height ratio (v)
        wh_ratio = (4 / (math.pi**2)) * torch.pow(
            torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)

        # alpha is a weighting factor and is excluded from the gradient.
        with torch.no_grad():
            alpha = wh_ratio / (wh_ratio - ious + (1 + eps))

        # CIoU
        ious = ious - ((rho2 / enclose_area) + (alpha * wh_ratio))

    elif iou_mode == 'giou':
        # GIoU = IoU - ( (A_c - union) / A_c )
        convex_area = enclose_w * enclose_h + eps  # convex area (A_c)
        ious = ious - (convex_area - union) / convex_area

    elif iou_mode == 'siou':
        # SIoU: https://arxiv.org/pdf/2205.12740.pdf
        # SIoU = IoU - ( (Distance Cost + Shape Cost) / 2 )

        # calculate sigma (σ):
        # euclidean distance between the centers of bbox2 (gt) and
        # bbox1 (pred); eps keeps sigma away from zero.
        # sigma_cw = b_cx_gt - b_cx
        sigma_cw = (bbox2_x1 + bbox2_x2) / 2 - (bbox1_x1 + bbox1_x2) / 2 + eps
        # sigma_ch = b_cy_gt - b_cy
        sigma_ch = (bbox2_y1 + bbox2_y2) / 2 - (bbox1_y1 + bbox1_y2) / 2 + eps
        # sigma = √( (sigma_cw ** 2) + (sigma_ch ** 2) )
        sigma = torch.pow(sigma_cw**2 + sigma_ch**2, 0.5)

        # choose minimize alpha, sin(alpha)
        sin_alpha = torch.abs(sigma_ch) / sigma
        sin_beta = torch.abs(sigma_cw) / sigma
        sin_alpha = torch.where(sin_alpha <= math.sin(math.pi / 4), sin_alpha,
                                sin_beta)

        # Angle cost = 1 - 2 * ( sin^2 ( arcsin(x) - (pi / 4) ) )
        angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)

        # Distance cost = Σ_(t=x,y) (1 - e ^ (- γ ρ_t))
        # The enclosing extent is 0 when both boxes are degenerate along an
        # axis; clamping only touches that case and avoids a division by 0.
        rho_x = (sigma_cw / enclose_w.clamp(min=eps))**2  # ρ_x
        rho_y = (sigma_ch / enclose_h.clamp(min=eps))**2  # ρ_y
        gamma = 2 - angle_cost  # γ
        distance_cost = (1 - torch.exp(-1 * gamma * rho_x)) + (
            1 - torch.exp(-1 * gamma * rho_y))

        # Shape cost = Ω = Σ_(t=w,h) ( ( 1 - ( e ^ (-ω_t) ) ) ^ θ )
        # Widths carry no eps, so two zero-width boxes would give 0 / 0
        # (NaN) without the clamp on the denominator.
        omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2).clamp(min=eps)  # ω_w
        omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2).clamp(min=eps)  # ω_h
        shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w),
                               siou_theta) + torch.pow(
                                   1 - torch.exp(-1 * omiga_h), siou_theta)

        ious = ious - ((distance_cost + shape_cost) * 0.5)

    return ious.clamp(min=-1.0, max=1.0)
@MODELS.register_module()
class OksLoss(nn.Module):
    """A PyTorch implementation of the Object Keypoint Similarity (OKS) loss as
    described in the paper "YOLO-Pose: Enhancing YOLO for Multi Person Pose
    Estimation Using Object Keypoint Similarity Loss" by Debapriya et al.
    (2022).

    The OKS loss is used for keypoint-based object recognition and consists
    of a measure of the similarity between predicted and ground truth
    keypoint locations, adjusted by the size of the object in the image.
    The loss function takes as input the predicted keypoint locations, the
    ground truth keypoint locations, a mask indicating which keypoints are
    valid, and bounding boxes for the objects.

    Args:
        metainfo (Optional[str]): Path to a JSON file containing information
            about the dataset's annotations. When it provides ``sigmas``,
            they are stored as a buffer and used to scale the per-keypoint
            distances. Defaults to None.
        loss_weight (float): Weight for the loss. Defaults to 1.0.
    """

    def __init__(self,
                 metainfo: Optional[str] = None,
                 loss_weight: float = 1.0):
        super().__init__()

        if metainfo is not None:
            # mmpose is an optional dependency; it is only needed to parse
            # the dataset meta information.
            if parse_pose_metainfo is None:
                raise ImportError(
                    'Please run "mim install -r requirements/mmpose.txt" '
                    'to install mmpose first for OksLoss.')
            metainfo = parse_pose_metainfo(dict(from_file=metainfo))
            sigmas = metainfo.get('sigmas', None)
            if sigmas is not None:
                self.register_buffer('sigmas', torch.as_tensor(sigmas))
        self.loss_weight = loss_weight

    def forward(self,
                output: Tensor,
                target: Tensor,
                target_weights: Tensor,
                bboxes: Optional[Tensor] = None) -> Tensor:
        """Compute the weighted OKS loss ``(1 - OKS) * loss_weight``.

        Args:
            output (Tensor): Predicted keypoints in shape N x k x 2.
            target (Tensor): Ground truth keypoints in the same shape as
                output.
            target_weights (Tensor): Mask of valid keypoints in shape N x k.
            bboxes (Optional[Tensor]): Bounding boxes in shape N x 4,
                where 4 are the xyxy coordinates.

        Returns:
            Tensor: The loss, one value per instance (the keypoint
            dimension is reduced).
        """
        oks = self.compute_oks(output, target, target_weights, bboxes)
        loss = 1 - oks
        return loss * self.loss_weight

    def compute_oks(self,
                    output: Tensor,
                    target: Tensor,
                    target_weights: Tensor,
                    bboxes: Optional[Tensor] = None) -> Tensor:
        """Calculates the OKS (similarity, not the loss).

        Args:
            output (Tensor): Predicted keypoints in shape N x k x 2, where N
                is batch size, k is the number of keypoints, and 2 are the
                xy coordinates.
            target (Tensor): Ground truth keypoints in the same shape as
                output.
            target_weights (Tensor): Mask of valid keypoints in shape N x k,
                with 1 for valid and 0 for invalid.
            bboxes (Optional[Tensor]): Bounding boxes in shape N x 4,
                where 4 are the xyxy coordinates.

        Returns:
            Tensor: The weighted mean OKS over the keypoints of each
            instance.
        """

        # Euclidean distance between each predicted and target keypoint.
        dist = torch.norm(output - target, dim=-1)

        if hasattr(self, 'sigmas'):
            # Broadcast the per-keypoint sigmas over the leading dims.
            sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1)
            dist = dist / sigmas
        if bboxes is not None:
            # Normalise by the bbox diagonal length so the measure is
            # independent of the object size.
            bbox_diag = torch.norm(bboxes[..., 2:] - bboxes[..., :2], dim=-1)
            dist = dist / bbox_diag.clip(min=1e-8).unsqueeze(-1)

        # Weighted mean over keypoints; the clip guards against instances
        # without any valid keypoint.
        return (torch.exp(-dist.pow(2) / 2) * target_weights).sum(
            dim=-1) / target_weights.sum(dim=-1).clip(min=1e-8)
+from abc import ABCMeta, abstractmethod +from typing import List, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseYOLONeck(BaseModule, metaclass=ABCMeta): + """Base neck used in YOLO series. + + .. code:: text + + P5 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + stride=32 +--------+ +-----------+ + idx=2 +------+ ^ v + -----> |reduce| | +-----------+ + |layer2|---------+------->| cat | + +------+ +-----------+ + v + +-----------+ +-------+ + | bottom_up |--->| out |---> output2 + | layer1 | | layer2| + +-----------+ +-------+ + + .. 
code:: text + + P6 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer3 |--->| cat | + +--------+ +-----------+ + stride=32 ^ v + idx=2 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output2 + |layer2| +--------+ | layer1 | | layer2| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer3 | | layer2 | + +--------+ +-----------+ + stride=64 ^ v + idx=3 +------+ | +-----------+ + -----> |reduce|---------+------->| cat | + |layer3| +-----------+ + +------+ v + +-----------+ +-------+ + | bottom_up |--->| out |---> output3 + | layer2 | | layer3| + +-----------+ +-------+ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. 
+ Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False + norm_cfg (dict): Config dict for normalization layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[int, List[int]], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + upsample_feats_cat_first: bool = True, + freeze_all: bool = False, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.deepen_factor = deepen_factor + self.widen_factor = widen_factor + self.upsample_feats_cat_first = upsample_feats_cat_first + self.freeze_all = freeze_all + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.reduce_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.reduce_layers.append(self.build_reduce_layer(idx)) + + # build top-down blocks + self.upsample_layers = nn.ModuleList() + self.top_down_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.upsample_layers.append(self.build_upsample_layer(idx)) + self.top_down_layers.append(self.build_top_down_layer(idx)) + + # build bottom-up blocks + self.downsample_layers = nn.ModuleList() + self.bottom_up_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsample_layers.append(self.build_downsample_layer(idx)) + self.bottom_up_layers.append(self.build_bottom_up_layer(idx)) + + self.out_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.out_layers.append(self.build_out_layer(idx)) + + @abstractmethod + def build_reduce_layer(self, idx: int): + """build reduce layer.""" + pass + + @abstractmethod + def build_upsample_layer(self, idx: int): + 
"""build upsample layer.""" + pass + + @abstractmethod + def build_top_down_layer(self, idx: int): + """build top down layer.""" + pass + + @abstractmethod + def build_downsample_layer(self, idx: int): + """build downsample layer.""" + pass + + @abstractmethod + def build_bottom_up_layer(self, idx: int): + """build bottom up layer.""" + pass + + @abstractmethod + def build_out_layer(self, idx: int): + """build out layer.""" + pass + + def _freeze_all(self): + """Freeze the model.""" + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep the normalization + layer freezed.""" + super().train(mode) + if self.freeze_all: + self._freeze_all() + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx]( + feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in 
@MODELS.register_module()
class CSPNeXtPAFPN(BaseYOLONeck):
    """Path Aggregation Network with CSPNeXt blocks.

    Args:
        in_channels (Sequence[int]): Number of input channels per scale.
            Each value is multiplied by ``widen_factor``.
        out_channels (int): Number of output channels (used at each scale).
            The value is multiplied by ``widen_factor``.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_csp_blocks (int): Number of bottlenecks in CSPLayer.
            Defaults to 3.
        freeze_all (bool): Whether to freeze the model; forwarded to
            ``BaseYOLONeck``. Defaults to False.
        use_depthwise (bool): Whether to use depthwise separable convolution in
            blocks. Defaults to False.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Defaults to 0.5.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(scale_factor=2, mode='nearest')`
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN')
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='SiLU', inplace=True)
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: Kaiming uniform initialization of the Conv2d layers.
    """

    def __init__(
        self,
        in_channels: Sequence[int],
        out_channels: int,
        deepen_factor: float = 1.0,
        widen_factor: float = 1.0,
        num_csp_blocks: int = 3,
        freeze_all: bool = False,
        use_depthwise: bool = False,
        expand_ratio: float = 0.5,
        upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'),
        conv_cfg: ConfigType = None,
        norm_cfg: ConfigType = dict(type='BN'),
        act_cfg: ConfigType = dict(type='SiLU', inplace=True),
        init_cfg: OptMultiConfig = dict(
            type='Kaiming',
            layer='Conv2d',
            a=math.sqrt(5),
            distribution='uniform',
            mode='fan_in',
            nonlinearity='leaky_relu')
    ) -> None:
        # These attributes are read by the ``build_*`` hooks, which the
        # parent constructor calls, so they must exist before
        # ``super().__init__``.
        self.num_csp_blocks = round(num_csp_blocks * deepen_factor)
        self.conv = DepthwiseSeparableConvModule \
            if use_depthwise else ConvModule
        self.upsample_cfg = upsample_cfg
        self.expand_ratio = expand_ratio
        self.conv_cfg = conv_cfg

        super().__init__(
            in_channels=[
                int(channel * widen_factor) for channel in in_channels
            ],
            out_channels=int(out_channels * widen_factor),
            deepen_factor=deepen_factor,
            widen_factor=widen_factor,
            freeze_all=freeze_all,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            init_cfg=init_cfg)

    def _build_csp_layer(self, in_channels: int,
                         out_channels: int) -> nn.Module:
        """Build the CSPNeXt layer shared by both aggregation paths."""
        return CSPLayer(
            in_channels,
            out_channels,
            num_blocks=self.num_csp_blocks,
            add_identity=False,
            use_cspnext_block=True,
            expand_ratio=self.expand_ratio,
            conv_cfg=self.conv_cfg,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def build_reduce_layer(self, idx: int) -> nn.Module:
        """build reduce layer.

        Only the last (deepest) scale is reduced, to the channel count of
        the scale before it; every other scale is passed through unchanged.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The reduce layer.
        """
        if idx != len(self.in_channels) - 1:
            return nn.Identity()

        return self.conv(
            self.in_channels[idx],
            self.in_channels[idx - 1],
            1,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def build_upsample_layer(self, *args, **kwargs) -> nn.Module:
        """build upsample layer.

        The positional and keyword arguments are ignored; every scale uses
        the same ``nn.Upsample`` built from ``upsample_cfg``.
        """
        return nn.Upsample(**self.upsample_cfg)

    def build_top_down_layer(self, idx: int) -> nn.Module:
        """build top down layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The top down layer. For ``idx == 1`` this is the CSP
            layer alone; otherwise it is followed by a 1x1 conv that maps
            to the channel count of the next shallower scale.
        """
        csp_layer = self._build_csp_layer(self.in_channels[idx - 1] * 2,
                                          self.in_channels[idx - 1])
        if idx == 1:
            return csp_layer

        return nn.Sequential(
            csp_layer,
            self.conv(
                self.in_channels[idx - 1],
                self.in_channels[idx - 2],
                kernel_size=1,
                norm_cfg=self.norm_cfg,
                act_cfg=self.act_cfg))

    def build_downsample_layer(self, idx: int) -> nn.Module:
        """build downsample layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The downsample layer, a stride-2 3x3 conv that keeps
            the channel count.
        """
        return self.conv(
            self.in_channels[idx],
            self.in_channels[idx],
            kernel_size=3,
            stride=2,
            padding=1,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)

    def build_bottom_up_layer(self, idx: int) -> nn.Module:
        """build bottom up layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The bottom up layer.
        """
        return self._build_csp_layer(self.in_channels[idx] * 2,
                                     self.in_channels[idx + 1])

    def build_out_layer(self, idx: int) -> nn.Module:
        """build out layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The out layer, a 3x3 conv that maps the scale to
            ``out_channels``.
        """
        return self.conv(
            self.in_channels[idx],
            self.out_channels,
            3,
            padding=1,
            norm_cfg=self.norm_cfg,
            act_cfg=self.act_cfg)
If you want to use Drop block after + `CSPResLayer`, you can set this para as + dict(type='mmdet.DropBlock', drop_prob=0.1, + block_size=3, warm_iters=0). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + use_spp (bool): Whether to use `SPP` in reduce layer. + Defaults to False. + """ + + def __init__(self, + in_channels: List[int] = [256, 512, 1024], + out_channels: List[int] = [256, 512, 1024], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + freeze_all: bool = False, + num_csplayer: int = 1, + num_blocks_per_layer: int = 3, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=False, + use_alpha=False), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + drop_block_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + use_spp: bool = False): + self.block_cfg = block_cfg + self.num_csplayer = num_csplayer + self.num_blocks_per_layer = round(num_blocks_per_layer * deepen_factor) + # Only use spp in last reduce_layer, if use_spp=True. + self.use_spp = use_spp + self.drop_block_cfg = drop_block_cfg + assert drop_block_cfg is None or isinstance(drop_block_cfg, dict) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int): + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
+ """ + if idx == len(self.in_channels) - 1: + # fpn_stage + in_channels = self.in_channels[idx] + out_channels = self.out_channels[idx] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=self.use_spp) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + layer = nn.Sequential(*layer) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + # fpn_route + in_channels = self.out_channels[idx] + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=in_channels // 2, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + # fpn_stage + in_channels = self.in_channels[idx - 1] + self.out_channels[idx] // 2 + out_channels = self.out_channels[idx - 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. 
+ """ + # pan_route + return ConvModule( + in_channels=self.out_channels[idx], + out_channels=self.out_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + # pan_stage + in_channels = self.out_channels[idx + 1] + self.out_channels[idx] + out_channels = self.out_channels[idx + 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b95147fc512359442aeb1bbc88aadd07031bdadf --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..utils import make_divisible, make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv5PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv5. + + Args: + in_channels (List[int]): Number of input channels per scale. 
+ out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 1, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
+ """ + if idx == len(self.in_channels) - 1: + layer = ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int): + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + + if idx == 1: + return CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + make_divisible(self.in_channels[idx - 2], + self.widen_factor), + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. 
+ + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + make_divisible(self.in_channels[idx] * 2, self.widen_factor), + make_divisible(self.in_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..87782712352e269f159cc56da6ba6715840c87c7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, BiFusion, RepStageBlock +from ..utils import make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv6RepPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 12. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. 
+ Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + self.block_cfg = block_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + in_channels=int(self.in_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. + """ + return nn.ConvTranspose2d( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + kernel_size=2, + stride=2, + bias=True) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. 
+ """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + in_channels=int(self.out_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx] * self.widen_factor), + kernel_size=3, + stride=2, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. 
+ """ + block_cfg = self.block_cfg.copy() + + return RepStageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPRepPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 12. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. 
+ """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + +@MODELS.register_module() +class YOLOv6RepBiPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 12. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.extra_in_channel = in_channels[0] + super().__init__( + in_channels=in_channels[1:], + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. 
+ """ + in_channels1 = self.in_channels[ + idx - 2] if idx > 1 else self.extra_in_channel + return BiFusion( + in_channels0=int(self.in_channels[idx - 1] * self.widen_factor), + in_channels1=int(in_channels1 * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + 1 + # reduce layers + reduce_outs = [inputs[0]] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx + 1])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_cur = reduce_outs[idx] + feat_low = reduce_outs[idx - 1] + top_down_layer_inputs = self.upsample_layers[len(self.in_channels) + - 1 - idx]([ + feat_high, + feat_cur, feat_low + ]) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) + + +@MODELS.register_module() +class YOLOv6CSPRepBiPAFPN(YOLOv6RepBiPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. 
+ widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 12. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. 
+ """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d31f4623b50083ff820e6b20229b33ad0f41860 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock, RepVGGBlock, SPPFCSPBlock +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv7PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv7. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (List[int]): Number of output channels per scale. + block_cfg (dict): Config dict for block. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + spp_expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. + is_tiny_version (bool): Is tiny version of neck. If True, + it means it is a yolov7 tiny model. Defaults to False. + use_maxpool_in_downsample (bool): Whether maxpooling is + used in downsample layers. Defaults to True. + use_in_channels_in_downsample (bool): MaxPoolAndStrideConvBlock + module input parameters. Defaults to False. + use_repconv_outs (bool): Whether to use `repconv` in the output + layer. Defaults to True. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to False. Currently only YOLOv7 uses False. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: List[int], + block_cfg: dict = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + spp_expand_ratio: float = 0.5, + is_tiny_version: bool = False, + use_maxpool_in_downsample: bool = True, + use_in_channels_in_downsample: bool = False, + use_repconv_outs: bool = True, + upsample_feats_cat_first: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + + self.is_tiny_version = is_tiny_version + self.use_maxpool_in_downsample = use_maxpool_in_downsample + self.use_in_channels_in_downsample = use_in_channels_in_downsample + self.spp_expand_ratio = spp_expand_ratio + self.use_repconv_outs = use_repconv_outs + self.block_cfg = block_cfg + self.block_cfg.setdefault('norm_cfg', norm_cfg) + self.block_cfg.setdefault('act_cfg', act_cfg) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + upsample_feats_cat_first=upsample_feats_cat_first, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
+ """ + if idx == len(self.in_channels) - 1: + layer = SPPFCSPBlock( + self.in_channels[idx], + self.out_channels[idx], + expand_ratio=self.spp_expand_ratio, + is_tiny_version=self.is_tiny_version, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = ConvModule( + self.in_channels[idx], + self.out_channels[idx], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + return nn.Sequential( + ConvModule( + self.out_channels[idx], + self.out_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx - 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx - 1] + return MODELS.build(block_cfg) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + if self.use_maxpool_in_downsample and not self.is_tiny_version: + return MaxPoolAndStrideConvBlock( + self.out_channels[idx], + self.out_channels[idx + 1], + use_in_channels_of_middle=self.use_in_channels_in_downsample, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + self.out_channels[idx + 1], + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
+ """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx + 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx + 1] + return MODELS.build(block_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + if len(self.in_channels) == 4: + # P6 + return nn.Identity() + + out_channels = self.out_channels[idx] * 2 + + if self.use_repconv_outs: + return RepVGGBlock( + self.out_channels[idx], + out_channels, + 3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e26698bcc191b0141d89c1e965de811494a96539 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .. import CSPLayerWithTwoConv +from ..utils import make_divisible, make_round +from .yolov5_pafpn import YOLOv5PAFPN + + +@MODELS.register_module() +class YOLOv8PAFPN(YOLOv5PAFPN): + """Path Aggregation Network used in YOLOv8. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. 
Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + return nn.Identity() + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + return CSPLayerWithTwoConv( + make_divisible((self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + make_divisible(self.out_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
+ """ + return CSPLayerWithTwoConv( + make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + make_divisible(self.out_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..bd2595e70fe47e38e68ebd0d878deb6f264bf2d1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOXPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. 
+ Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.use_depthwise = use_depthwise + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
+ """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif idx == 2: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + return conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. 
+ """ + return ConvModule( + self.in_channels[idx], + self.out_channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..497233ac21a4dd1a6a2a3127c09435d8146eb553 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .cbam import CBAM + +__all__ = ['CBAM'] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/cbam.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..e9559f2e2db951a5681ec9af5864928ed480361b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/plugins/cbam.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import OptMultiConfig +from mmengine.model import BaseModule + +from mmyolo.registry import MODELS + + +class ChannelAttention(BaseModule): + """ChannelAttention. + + Args: + channels (int): The input (and output) channels of the + ChannelAttention. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + act_cfg (dict): Config dict for activation layer + Defaults to dict(type='ReLU'). 
+ """ + + def __init__(self, + channels: int, + reduce_ratio: int = 16, + act_cfg: dict = dict(type='ReLU')): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc = nn.Sequential( + ConvModule( + in_channels=channels, + out_channels=int(channels / reduce_ratio), + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=act_cfg), + ConvModule( + in_channels=int(channels / reduce_ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=None)) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avgpool_out = self.fc(self.avg_pool(x)) + maxpool_out = self.fc(self.max_pool(x)) + out = self.sigmoid(avgpool_out + maxpool_out) + return out + + +class SpatialAttention(BaseModule): + """SpatialAttention + Args: + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + """ + + def __init__(self, kernel_size: int = 7): + super().__init__() + + self.conv = ConvModule( + in_channels=2, + out_channels=1, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=None, + act_cfg=dict(type='Sigmoid')) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out, _ = torch.max(x, dim=1, keepdim=True) + out = torch.cat([avg_out, max_out], dim=1) + out = self.conv(out) + return out + + +@MODELS.register_module() +class CBAM(BaseModule): + """Convolutional Block Attention Module. arxiv link: + https://arxiv.org/abs/1807.06521v2. + + Args: + in_channels (int): The input (and output) channels of the CBAM. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. 
+ act_cfg (dict): Config dict for activation layer in ChannelAttention + Defaults to dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + reduce_ratio: int = 16, + kernel_size: int = 7, + act_cfg: dict = dict(type='ReLU'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.channel_attention = ChannelAttention( + channels=in_channels, reduce_ratio=reduce_ratio, act_cfg=act_cfg) + + self.spatial_attention = SpatialAttention(kernel_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + out = self.channel_attention(x) * x + out = self.spatial_attention(out) * out + return out diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dbdc25fa3cf16e85e0e99e7d302a98f2b4f13ce --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import BatchATSSAssigner, BatchTaskAlignedAssigner +from .coders import YOLOv5BBoxCoder, YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'BatchATSSAssigner', + 'BatchTaskAlignedAssigner' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2e2e69c921367083e21abce799e3ef5b8d47e1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .batch_atss_assigner import BatchATSSAssigner +from .batch_dsl_assigner import BatchDynamicSoftLabelAssigner +from .batch_task_aligned_assigner import BatchTaskAlignedAssigner +from .pose_sim_ota_assigner import PoseSimOTAAssigner +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + +__all__ = [ + 'BatchATSSAssigner', 'BatchTaskAlignedAssigner', + 'select_candidates_in_gts', 'select_highest_overlaps', + 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner', + 'PoseSimOTAAssigner' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..45b3069afde73e240890273c58e3860da59ad854 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py @@ -0,0 +1,339 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +def bbox_center_distance(bboxes: Tensor, + priors: Tensor) -> Tuple[Tensor, Tensor]: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for bbox, "xyxy" format. + priors (Tensor): Shape (num_priors, 4) for priors, "xyxy" format. + + Returns: + distances (Tensor): Center distances between bboxes and priors, + shape (num_priors, n). + priors_points (Tensor): Priors cx cy points, + shape (num_priors, 2). 
+ """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (bbox_points[:, None, :] - + priors_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances, priors_points + + +@TASK_UTILS.register_module() +class BatchATSSAssigner(nn.Module): + """Assign a batch of corresponding gt bboxes or background to each prior. + + This code is based on + https://github.com/meituan/YOLOv6/blob/main/yolov6/assigners/atss_assigner.py + + Each proposal will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + num_classes (int): number of class + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + topk (int): number of priors selected in each level + """ + + def __init__( + self, + num_classes: int, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + topk: int = 9): + super().__init__() + self.num_classes = num_classes + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.topk = topk + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, priors: Tensor, + num_level_priors: List, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + """Assign gt to priors. + + The assignment is done in following steps + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k prior whose center + are closest to the gt center, so we total select k*l prior as + candidates for each gt + 4. 
get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. limit the positive sample's center in gt + + Args: + pred_bboxes (Tensor): Predicted bounding boxes, + shape(batch_size, num_priors, 4) + priors (Tensor): Model priors with stride, shape(num_priors, 4) + num_level_priors (List): Number of bboxes in each level, len(3) + gt_labels (Tensor): Ground truth label, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground truth bbox, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict): Assigned result + 'assigned_labels' (Tensor): shape(batch_size, num_gt) + 'assigned_bboxes' (Tensor): shape(batch_size, num_gt, 4) + 'assigned_scores' (Tensor): + shape(batch_size, num_gt, number_classes) + 'fg_mask_pre_prior' (Tensor): shape(bs, num_gt) + """ + # generate priors + cell_half_size = priors[:, 2:] * 2.5 + priors_gen = torch.zeros_like(priors) + priors_gen[:, :2] = priors[:, :2] - cell_half_size + priors_gen[:, 2:] = priors[:, :2] + cell_half_size + priors = priors_gen + + batch_size = gt_bboxes.size(0) + num_gt, num_priors = gt_bboxes.size(1), priors.size(0) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full([batch_size, num_priors], self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full([batch_size, num_priors, 4], 0), + 'assigned_scores': + gt_bboxes.new_full([batch_size, num_priors, self.num_classes], 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full([batch_size, num_priors], 0) + } + + if num_gt == 0: + return assigned_result + + # compute iou between all prior (prior of all pyramid levels) and gt + overlaps = self.iou_calculator(gt_bboxes.reshape([-1, 4]), priors) + overlaps = overlaps.reshape([batch_size, -1, num_priors]) + + # compute center distance between 
all prior and gt + distances, priors_points = bbox_center_distance( + gt_bboxes.reshape([-1, 4]), priors) + distances = distances.reshape([batch_size, -1, num_priors]) + + # Selecting candidates based on the center distance + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, num_level_priors, pad_bbox_flag) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + overlaps_thr_per_gt, iou_candidates = self.threshold_calculator( + is_in_candidate, candidate_idxs, overlaps, num_priors, batch_size, + num_gt) + + # select candidates iou >= threshold as positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, num_priors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(priors_points, gt_bboxes) + pos_mask = is_pos * is_in_gts * pad_bbox_flag + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. + gt_idx_pre_prior, fg_mask_pre_prior, pos_mask = \ + select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, gt_idx_pre_prior, fg_mask_pre_prior, + num_priors, batch_size, num_gt) + + # soft label with iou + if pred_bboxes is not None: + ious = yolov6_iou_calculator(gt_bboxes, pred_bboxes) * pos_mask + ious = ious.max(axis=-2)[0].unsqueeze(-1) + assigned_scores *= ious + + assigned_result['assigned_labels'] = assigned_labels.long() + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def select_topk_candidates(self, distances: Tensor, + num_level_priors: List[int], + pad_bbox_flag: Tensor) -> Tuple[Tensor, Tensor]: + """Selecting candidates based on the center distance. 
+ + Args: + distances (Tensor): Distance between all bbox and gt, + shape(batch_size, num_gt, num_priors) + num_level_priors (List[int]): Number of bboxes in each level, + len(3) + pad_bbox_flag (Tensor): Ground truth bbox mask, + shape(batch_size, num_gt, 1) + + Return: + is_in_candidate_list (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors) + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + """ + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + + distances_dtype = distances.dtype + distances = torch.split(distances, num_level_priors, dim=-1) + pad_bbox_flag = pad_bbox_flag.repeat(1, 1, self.topk).bool() + + for distances_per_level, priors_per_level in zip( + distances, num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_index = start_idx + priors_per_level + selected_k = min(self.topk, priors_per_level) + + _, topk_idxs_per_level = distances_per_level.topk( + selected_k, dim=-1, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + + topk_idxs_per_level = torch.where( + pad_bbox_flag, topk_idxs_per_level, + torch.zeros_like(topk_idxs_per_level)) + + is_in_candidate = F.one_hot(topk_idxs_per_level, + priors_per_level).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), + is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances_dtype)) + + start_idx = end_index + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, candidate_idxs + + @staticmethod + def threshold_calculator(is_in_candidate: List, candidate_idxs: Tensor, + overlaps: Tensor, num_priors: int, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Get corresponding iou for the these candidates, and compute the mean + and std, set mean 
+ std as the iou threshold. + + Args: + is_in_candidate (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors). + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + overlaps (Tensor): Overlaps area, + shape(batch_size, num_gt, num_priors). + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + overlaps_thr_per_gt (Tensor): Overlap threshold of + per ground truth, shape(batch_size, num_gt, 1). + candidate_overlaps (Tensor): Candidate overlaps, + shape(batch_size, num_gt, num_priors). + """ + + batch_size_num_gt = batch_size * num_gt + candidate_overlaps = torch.where(is_in_candidate > 0, overlaps, + torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([batch_size_num_gt, -1]) + + assist_indexes = num_priors * torch.arange( + batch_size_num_gt, device=candidate_idxs.device) + assist_indexes = assist_indexes[:, None] + flatten_indexes = candidate_idxs + assist_indexes + + candidate_overlaps_reshape = candidate_overlaps.reshape( + -1)[flatten_indexes] + candidate_overlaps_reshape = candidate_overlaps_reshape.reshape( + [batch_size, num_gt, -1]) + + overlaps_mean_per_gt = candidate_overlaps_reshape.mean( + axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps_reshape.std( + axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, candidate_overlaps + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_inds: Tensor, fg_mask_pre_prior: Tensor, + num_priors: int, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get target info. 
+ + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_inds (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + + # assigned target labels + batch_index = torch.arange( + batch_size, dtype=gt_labels.dtype, device=gt_labels.device) + batch_index = batch_index[..., None] + assigned_gt_inds = (assigned_gt_inds + batch_index * num_gt).long() + assigned_labels = gt_labels.flatten()[assigned_gt_inds.flatten()] + assigned_labels = assigned_labels.reshape([batch_size, num_priors]) + assigned_labels = torch.where( + fg_mask_pre_prior > 0, assigned_labels, + torch.full_like(assigned_labels, self.num_classes)) + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, + 4])[assigned_gt_inds.flatten()] + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_priors, 4]) + + # assigned target scores + assigned_scores = F.one_hot(assigned_labels.long(), + self.num_classes + 1).float() + assigned_scores = assigned_scores[:, :, :self.num_classes] + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae0f80239590f9c906778e6e4c7c6b4bd10c488 --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py @@ -0,0 +1,272 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS + +INF = 100000000 +EPS = 1.0e-7 + + +def find_inside_points(boxes: Tensor, + points: Tensor, + box_dim: int = 4, + eps: float = 0.01) -> Tensor: + """Find inside box points in batches. Boxes dimension must be 3. + + Args: + boxes (Tensor): Boxes tensor. Must be batch input. + Has shape of (batch_size, n_boxes, box_dim). + points (Tensor): Points coordinates. Has shape of (n_points, 2). + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + eps (float): Make sure the points are inside not on the boundary. + Only use in rotated boxes. Defaults to 0.01. + + Returns: + Tensor: A BoolTensor indicating whether a point is inside + boxes. The index has shape of (n_points, batch_size, n_boxes). 
+ """ + if box_dim == 4: + # Horizontal Boxes + lt_ = points[:, None, None] - boxes[..., :2] + rb_ = boxes[..., 2:] - points[:, None, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + elif box_dim == 5: + # Rotated Boxes + points = points[:, None, None] + ctrs, wh, t = torch.split(boxes, [2, 2, 1], dim=-1) + cos_value, sin_value = torch.cos(t), torch.sin(t) + matrix = torch.cat([cos_value, sin_value, -sin_value, cos_value], + dim=-1).reshape(*boxes.shape[:-1], 2, 2) + + offset = points - ctrs + offset = torch.matmul(matrix, offset[..., None]) + offset = offset.squeeze(-1) + offset_x, offset_y = offset[..., 0], offset[..., 1] + w, h = wh[..., 0], wh[..., 1] + is_in_gts = (offset_x <= w / 2 - eps) & (offset_x >= - w / 2 + eps) & \ + (offset_y <= h / 2 - eps) & (offset_y >= - h / 2 + eps) + else: + raise NotImplementedError(f'Unsupport box_dim:{box_dim}') + + return is_in_gts + + +def get_box_center(boxes: Tensor, box_dim: int = 4) -> Tensor: + """Return a tensor representing the centers of boxes. + + Args: + boxes (Tensor): Boxes tensor. Has shape of (b, n, box_dim) + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + + Returns: + Tensor: Centers have shape of (b, n, 2) + """ + if box_dim == 4: + # Horizontal Boxes, (x1, y1, x2, y2) + return (boxes[..., :2] + boxes[..., 2:]) / 2.0 + elif box_dim == 5: + # Rotated Boxes, (x, y, w, h, a) + return boxes[..., :2] + else: + raise NotImplementedError(f'Unsupported box_dim:{box_dim}') + + +@TASK_UTILS.register_module() +class BatchDynamicSoftLabelAssigner(nn.Module): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + num_classes (int): number of class + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. 
@TASK_UTILS.register_module()
class BatchDynamicSoftLabelAssigner(nn.Module):
    """Computes matching between predictions and ground truth with dynamic soft
    label assignment.

    The matching cost of a (prior, gt) pair is the sum of a soft
    classification cost, an IoU cost and a soft center prior; positives are
    then picked per gt with a dynamic top-k rule (see
    :meth:`dynamic_k_matching`).

    Args:
        num_classes (int): number of class
        soft_center_radius (float): Radius of the soft center prior.
            Defaults to 3.0.
        topk (int): Select top-k predictions to calculate dynamic k
            best matches for each gt. Defaults to 13.
        iou_weight (float): The scale factor of iou cost. Defaults to 3.0.
        iou_calculator (ConfigType): Config of overlaps Calculator.
            Defaults to dict(type='mmdet.BboxOverlaps2D').
        batch_iou (bool): Use batch input when calculate IoU.
            If set to False use loop instead. Defaults to True.
    """

    def __init__(
        self,
        num_classes: int,
        soft_center_radius: float = 3.0,
        topk: int = 13,
        iou_weight: float = 3.0,
        iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'),
        batch_iou: bool = True,
    ) -> None:
        super().__init__()
        self.num_classes = num_classes
        self.soft_center_radius = soft_center_radius
        self.topk = topk
        self.iou_weight = iou_weight
        # The IoU calculator is built from its config through the registry.
        self.iou_calculator = TASK_UTILS.build(iou_calculator)
        self.batch_iou = batch_iou

    @torch.no_grad()
    def forward(self, pred_bboxes: Tensor, pred_scores: Tensor, priors: Tensor,
                gt_labels: Tensor, gt_bboxes: Tensor,
                pad_bbox_flag: Tensor) -> dict:
        """Assign a ground-truth box (or background) to every prediction.

        Args:
            pred_bboxes (Tensor): Predicted boxes,
                shape (batch_size, num_bboxes, box_dim).
            pred_scores (Tensor): Classification logits per prediction;
                presumably shape (batch_size, num_bboxes, num_classes)
                -- TODO confirm against the caller.
            priors (Tensor): 2-D prior tensor; columns 0-1 are read as the
                prior center and column 2 as its stride.
            gt_labels (Tensor): Ground-truth labels; presumably shape
                (batch_size, num_gt, 1) -- TODO confirm.
            gt_bboxes (Tensor): Ground-truth boxes with ``num_gt`` on dim 1.
                Must be a plain tensor; ``BaseBoxes`` is rejected.
            pad_bbox_flag (Tensor): Padding mask of the ground truth;
                presumably shape (batch_size, num_gt, 1) with 1 for real
                boxes and 0 for padding -- TODO confirm.

        Returns:
            dict: ``assigned_labels`` (long, background filled with
            ``num_classes``), ``assigned_labels_weights`` (all ones),
            ``assigned_bboxes`` (zeros for background) and
            ``assign_metrics`` (matched IoU, zero for background).

        Raises:
            NotImplementedError: If ``gt_bboxes`` is a ``BaseBoxes``.
        """
        num_gt = gt_bboxes.size(1)
        decoded_bboxes = pred_bboxes
        batch_size, num_bboxes, box_dim = decoded_bboxes.size()

        # Nothing to match: everything is background.
        if num_gt == 0 or num_bboxes == 0:
            return {
                'assigned_labels':
                gt_labels.new_full(
                    pred_scores[..., 0].shape,
                    self.num_classes,
                    dtype=torch.long),
                'assigned_labels_weights':
                gt_bboxes.new_full(pred_scores[..., 0].shape, 1),
                'assigned_bboxes':
                gt_bboxes.new_full(pred_bboxes.shape, 0),
                'assign_metrics':
                gt_bboxes.new_full(pred_scores[..., 0].shape, 0)
            }

        prior_center = priors[:, :2]
        if isinstance(gt_bboxes, BaseBoxes):
            raise NotImplementedError(
                f'type of {type(gt_bboxes)} are not implemented !')
        else:
            is_in_gts = find_inside_points(gt_bboxes, prior_center, box_dim)

        # Drop hits on padded ground-truth boxes.
        # (N_points, B, N_boxes)
        is_in_gts = is_in_gts * pad_bbox_flag[..., 0][None]
        # (N_points, B, N_boxes) -> (B, N_points, N_boxes)
        is_in_gts = is_in_gts.permute(1, 0, 2)
        # (B, N_points)
        valid_mask = is_in_gts.sum(dim=-1) > 0

        gt_center = get_box_center(gt_bboxes, box_dim)

        # Center distance between every prior and every gt, in stride units.
        strides = priors[..., 2]
        distance = (priors[None].unsqueeze(2)[..., :2] -
                    gt_center[:, None, :, :]
                    ).pow(2).sum(-1).sqrt() / strides[None, :, None]

        # prevent overflow
        # (distances of priors outside every gt are zeroed before 10 ** x)
        distance = distance * valid_mask.unsqueeze(-1)
        soft_center_prior = torch.pow(10, distance - self.soft_center_radius)

        if self.batch_iou:
            pairwise_ious = self.iou_calculator(decoded_bboxes, gt_bboxes)
        else:
            # Per-image fallback when the calculator is not run batched.
            ious = []
            for box, gt in zip(decoded_bboxes, gt_bboxes):
                iou = self.iou_calculator(box, gt)
                ious.append(iou)
            pairwise_ious = torch.stack(ious, dim=0)

        iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight

        # select the predicted scores corresponded to the gt_labels
        pairwise_pred_scores = pred_scores.permute(0, 2, 1)
        # idx[0] holds the batch index, idx[1] the class index of every gt.
        idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long)
        idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt)
        idx[1] = gt_labels.long().squeeze(-1)
        pairwise_pred_scores = pairwise_pred_scores[idx[0],
                                                    idx[1]].permute(0, 2, 1)
        # classification cost
        # (BCE against the IoU as soft target, weighted by the squared gap
        # between IoU and predicted probability)
        scale_factor = pairwise_ious - pairwise_pred_scores.sigmoid()
        pairwise_cls_cost = F.binary_cross_entropy_with_logits(
            pairwise_pred_scores, pairwise_ious,
            reduction='none') * scale_factor.abs().pow(2.0)

        cost_matrix = pairwise_cls_cost + iou_cost + soft_center_prior

        # Priors that lie in no gt get cost INF so they are never selected.
        max_pad_value = torch.ones_like(cost_matrix) * INF
        cost_matrix = torch.where(valid_mask[..., None].repeat(1, 1, num_gt),
                                  cost_matrix, max_pad_value)

        (matched_pred_ious, matched_gt_inds,
         fg_mask_inboxes) = self.dynamic_k_matching(cost_matrix, pairwise_ious,
                                                    pad_bbox_flag)

        del pairwise_ious, cost_matrix

        # Batch index of every foreground prior, aligned with matched_gt_inds.
        batch_index = (fg_mask_inboxes > 0).nonzero(as_tuple=True)[0]

        assigned_labels = gt_labels.new_full(pred_scores[..., 0].shape,
                                             self.num_classes)
        assigned_labels[fg_mask_inboxes] = gt_labels[
            batch_index, matched_gt_inds].squeeze(-1)
        assigned_labels = assigned_labels.long()

        assigned_labels_weights = gt_bboxes.new_full(pred_scores[..., 0].shape,
                                                     1)

        assigned_bboxes = gt_bboxes.new_full(pred_bboxes.shape, 0)
        assigned_bboxes[fg_mask_inboxes] = gt_bboxes[batch_index,
                                                     matched_gt_inds]

        assign_metrics = gt_bboxes.new_full(pred_scores[..., 0].shape, 0)
        assign_metrics[fg_mask_inboxes] = matched_pred_ious

        return dict(
            assigned_labels=assigned_labels,
            assigned_labels_weights=assigned_labels_weights,
            assigned_bboxes=assigned_bboxes,
            assign_metrics=assign_metrics)

    def dynamic_k_matching(
            self, cost_matrix: Tensor, pairwise_ious: Tensor,
            pad_bbox_flag: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
        """Use IoU and matching cost to calculate the dynamic top-k positive
        targets.

        Args:
            cost_matrix (Tensor): Cost matrix, indexed as
                (batch, prior, gt).
            pairwise_ious (Tensor): Pairwise iou matrix, same layout as
                ``cost_matrix``.
            pad_bbox_flag (Tensor): Padding mask of the ground truth; summed
                over dims 1 and 2 to get the number of real gts per image.
        Returns:
            tuple: matched ious, matched gt indexes (both only for foreground
            priors) and the (batch, prior) foreground mask.
        """
        matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8)
        # select candidate topk ious for dynamic-k calculation
        candidate_topk = min(self.topk, pairwise_ious.size(1))
        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1)
        # calculate dynamic k for each gt
        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)

        num_gts = pad_bbox_flag.sum((1, 2)).int()
        # sorting the batch cost matrix is faster than topk
        _, sorted_indices = torch.sort(cost_matrix, dim=1)
        for b in range(pad_bbox_flag.shape[0]):
            # NOTE(review): only the first num_gts[b] columns are visited,
            # which assumes padded gts sit at the end -- confirm.
            for gt_idx in range(num_gts[b]):
                topk_ids = sorted_indices[b, :dynamic_ks[b, gt_idx], gt_idx]
                matching_matrix[b, :, gt_idx][topk_ids] = 1

        del topk_ious, dynamic_ks

        # A prior matched to several gts keeps only its cheapest gt.
        prior_match_gt_mask = matching_matrix.sum(2) > 1
        if prior_match_gt_mask.sum() > 0:
            cost_min, cost_argmin = torch.min(
                cost_matrix[prior_match_gt_mask, :], dim=1)
            matching_matrix[prior_match_gt_mask, :] *= 0
            matching_matrix[prior_match_gt_mask, cost_argmin] = 1

        # get foreground mask inside box and center prior
        fg_mask_inboxes = matching_matrix.sum(2) > 0
        matched_pred_ious = (matching_matrix *
                             pairwise_ious).sum(2)[fg_mask_inboxes]
        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)
        return matched_pred_ious, matched_gt_inds, fg_mask_inboxes
@TASK_UTILS.register_module()
class BatchTaskAlignedAssigner(nn.Module):
    """Batched task-aligned assigner.

    This code referenced to
    https://github.com/meituan/YOLOv6/blob/main/yolov6/
    assigners/tal_assigner.py.
    Batch Task aligned assigner base on the paper:
    `TOOD: Task-aligned One-stage Object Detection.
    <https://arxiv.org/abs/2108.07755>`_.

    For every image in the batch, each prior is assigned to at most one
    ground-truth box. The result is returned as a dict of dense tensors
    (see :meth:`forward`).

    Args:
        num_classes (int): number of class
        topk (int): number of bbox selected in each level
        alpha (float): Hyper-parameters related to alignment_metrics.
            Defaults to 1.0
        beta (float): Hyper-parameters related to alignment_metrics.
            Defaults to 6.
        eps (float): Small constant added to the denominator when the
            alignment metric is normalised, and used as threshold for the
            fallback top-k mask. Defaults to 1e-7.
        use_ciou (bool): Whether to use ciou while calculating iou.
            Defaults to False.
    """

    def __init__(self,
                 num_classes: int,
                 topk: int = 13,
                 alpha: float = 1.0,
                 beta: float = 6.0,
                 eps: float = 1e-7,
                 use_ciou: bool = False):
        super().__init__()
        self.num_classes = num_classes
        self.topk = topk
        self.alpha = alpha
        self.beta = beta
        self.eps = eps
        self.use_ciou = use_ciou

    @torch.no_grad()
    def forward(
        self,
        pred_bboxes: Tensor,
        pred_scores: Tensor,
        priors: Tensor,
        gt_labels: Tensor,
        gt_bboxes: Tensor,
        pad_bbox_flag: Tensor,
    ) -> dict:
        """Assign gt to bboxes.

        The assignment is done in following steps
        1. compute alignment metric between all bbox (bbox of all pyramid
           levels) and gt
        2. select top-k bbox as candidates for each gt
        3. limit the positive sample's center in gt (because the anchor-free
           detector only can predict positive distance)
        Args:
            pred_bboxes (Tensor): Predict bboxes,
                shape(batch_size, num_priors, 4)
            pred_scores (Tensor): Scores of predict bboxes,
                shape(batch_size, num_priors, num_classes)
            priors (Tensor): Model priors, shape (num_priors, 4)
            gt_labels (Tensor): Ground true labels,
                shape(batch_size, num_gt, 1)
            gt_bboxes (Tensor): Ground true bboxes,
                shape(batch_size, num_gt, 4)
            pad_bbox_flag (Tensor): Ground truth bbox mask,
                1 means bbox, 0 means no bbox,
                shape(batch_size, num_gt, 1)
        Returns:
            assigned_result (dict) Assigned result:
                assigned_labels (Tensor): Assigned labels,
                    shape(batch_size, num_priors)
                assigned_bboxes (Tensor): Assigned boxes,
                    shape(batch_size, num_priors, 4)
                assigned_scores (Tensor): Assigned scores,
                    shape(batch_size, num_priors, num_classes)
                fg_mask_pre_prior (Tensor): Force ground truth matching mask,
                    shape(batch_size, num_priors)
        """
        # (num_priors, 4) -> (num_priors, 2)
        priors = priors[:, :2]

        batch_size = pred_scores.size(0)
        num_gt = gt_bboxes.size(1)

        # Defaults describe an all-background assignment; they are returned
        # unchanged when the batch has no ground truth.
        assigned_result = {
            'assigned_labels':
            gt_bboxes.new_full(pred_scores[..., 0].shape, self.num_classes),
            'assigned_bboxes':
            gt_bboxes.new_full(pred_bboxes.shape, 0),
            'assigned_scores':
            gt_bboxes.new_full(pred_scores.shape, 0),
            'fg_mask_pre_prior':
            gt_bboxes.new_full(pred_scores[..., 0].shape, 0)
        }

        if num_gt == 0:
            return assigned_result

        pos_mask, alignment_metrics, overlaps = self.get_pos_mask(
            pred_bboxes, pred_scores, priors, gt_labels, gt_bboxes,
            pad_bbox_flag, batch_size, num_gt)

        (assigned_gt_idxs, fg_mask_pre_prior,
         pos_mask) = select_highest_overlaps(pos_mask, overlaps, num_gt)

        # assigned target
        assigned_labels, assigned_bboxes, assigned_scores = self.get_targets(
            gt_labels, gt_bboxes, assigned_gt_idxs, fg_mask_pre_prior,
            batch_size, num_gt)

        # normalize
        # Rescale each gt's alignment metric so that its best positive prior
        # gets a score equal to that gt's largest positive IoU.
        alignment_metrics *= pos_mask
        pos_align_metrics = alignment_metrics.max(axis=-1, keepdim=True)[0]
        pos_overlaps = (overlaps * pos_mask).max(axis=-1, keepdim=True)[0]
        norm_align_metric = (
            alignment_metrics * pos_overlaps /
            (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1)
        assigned_scores = assigned_scores * norm_align_metric

        assigned_result['assigned_labels'] = assigned_labels
        assigned_result['assigned_bboxes'] = assigned_bboxes
        assigned_result['assigned_scores'] = assigned_scores
        assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool()
        return assigned_result

    def get_pos_mask(self, pred_bboxes: Tensor, pred_scores: Tensor,
                     priors: Tensor, gt_labels: Tensor, gt_bboxes: Tensor,
                     pad_bbox_flag: Tensor, batch_size: int,
                     num_gt: int) -> Tuple[Tensor, Tensor, Tensor]:
        """Get possible mask.

        Args:
            pred_bboxes (Tensor): Predict bboxes,
                shape(batch_size, num_priors, 4)
            pred_scores (Tensor): Scores of predict bbox,
                shape(batch_size, num_priors, num_classes)
            priors (Tensor): Model priors, shape (num_priors, 2)
            gt_labels (Tensor): Ground true labels,
                shape(batch_size, num_gt, 1)
            gt_bboxes (Tensor): Ground true bboxes,
                shape(batch_size, num_gt, 4)
            pad_bbox_flag (Tensor): Ground truth bbox mask,
                1 means bbox, 0 means no bbox,
                shape(batch_size, num_gt, 1)
            batch_size (int): Batch size.
            num_gt (int): Number of ground truth.
        Returns:
            pos_mask (Tensor): Possible mask,
                shape(batch_size, num_gt, num_priors)
            alignment_metrics (Tensor): Alignment metrics,
                shape(batch_size, num_gt, num_priors)
            overlaps (Tensor): Overlaps of gt_bboxes and pred_bboxes,
                shape(batch_size, num_gt, num_priors)
        """

        # Compute alignment metric between all bbox and gt
        alignment_metrics, overlaps = \
            self.get_box_metrics(pred_bboxes, pred_scores, gt_labels,
                                 gt_bboxes, batch_size, num_gt)

        # get is_in_gts mask
        is_in_gts = select_candidates_in_gts(priors, gt_bboxes)

        # get topk_metric mask
        topk_metric = self.select_topk_candidates(
            alignment_metrics * is_in_gts,
            topk_mask=pad_bbox_flag.repeat([1, 1, self.topk]).bool())

        # merge all mask to a final mask
        pos_mask = topk_metric * is_in_gts * pad_bbox_flag

        return pos_mask, alignment_metrics, overlaps

    def get_box_metrics(self, pred_bboxes: Tensor, pred_scores: Tensor,
                        gt_labels: Tensor, gt_bboxes: Tensor, batch_size: int,
                        num_gt: int) -> Tuple[Tensor, Tensor]:
        """Compute alignment metric between all bbox and gt.

        The metric is ``score ** alpha * iou ** beta`` where ``score`` is the
        predicted score of the gt's class.

        Args:
            pred_bboxes (Tensor): Predict bboxes,
                shape(batch_size, num_priors, 4)
            pred_scores (Tensor): Scores of predict bbox,
                shape(batch_size, num_priors, num_classes)
            gt_labels (Tensor): Ground true labels,
                shape(batch_size, num_gt, 1)
            gt_bboxes (Tensor): Ground true bboxes,
                shape(batch_size, num_gt, 4)
            batch_size (int): Batch size.
            num_gt (int): Number of ground truth.
        Returns:
            alignment_metrics (Tensor): Align metric,
                shape(batch_size, num_gt, num_priors)
            overlaps (Tensor): Overlaps, shape(batch_size, num_gt, num_priors)
        """
        pred_scores = pred_scores.permute(0, 2, 1)
        gt_labels = gt_labels.to(torch.long)
        # idx[0] holds the batch index, idx[1] the class index of every gt;
        # together they gather the per-prior score of each gt's class.
        idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long)
        idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt)
        idx[1] = gt_labels.squeeze(-1)
        bbox_scores = pred_scores[idx[0], idx[1]]
        # TODO: need to replace the yolov6_iou_calculator function
        if self.use_ciou:
            overlaps = bbox_overlaps(
                pred_bboxes.unsqueeze(1),
                gt_bboxes.unsqueeze(2),
                iou_mode='ciou',
                bbox_format='xyxy').clamp(0)
        else:
            overlaps = yolov6_iou_calculator(gt_bboxes, pred_bboxes)

        alignment_metrics = bbox_scores.pow(self.alpha) * overlaps.pow(
            self.beta)

        return alignment_metrics, overlaps

    def select_topk_candidates(self,
                               alignment_gt_metrics: Tensor,
                               using_largest_topk: bool = True,
                               topk_mask: Optional[Tensor] = None) -> Tensor:
        """Select the top-k priors of every gt by alignment metric.

        Args:
            alignment_gt_metrics (Tensor): Alignment metric of gt candidates,
                shape(batch_size, num_gt, num_priors)
            using_largest_topk (bool): Controls whether to using largest or
                smallest elements.
            topk_mask (Tensor, optional): Topk mask,
                shape(batch_size, num_gt, self.topk). Rows that are False
                are ignored. When None, a gt is kept if its best top-k
                metric exceeds ``self.eps``.
        Returns:
            Tensor: Topk candidates mask,
                shape(batch_size, num_gt, num_priors)
        """
        num_priors = alignment_gt_metrics.shape[-1]
        topk_metrics, topk_idxs = torch.topk(
            alignment_gt_metrics,
            self.topk,
            axis=-1,
            largest=using_largest_topk)
        if topk_mask is None:
            # ``Tensor.max`` with a dim returns (values, indices); only the
            # values can be compared with the threshold. Comparing the tuple
            # itself raised a TypeError.
            topk_mask = (topk_metrics.max(axis=-1, keepdim=True)[0] >
                         self.eps).tile([1, 1, self.topk])
        # Masked-out entries are redirected to prior 0 ...
        topk_idxs = torch.where(topk_mask, topk_idxs,
                                torch.zeros_like(topk_idxs))
        is_in_topk = F.one_hot(topk_idxs, num_priors).sum(axis=-2)
        # ... where they pile up to a count above 1 and are cleared here.
        is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk),
                                 is_in_topk)
        return is_in_topk.to(alignment_gt_metrics.dtype)

    def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor,
                    assigned_gt_idxs: Tensor, fg_mask_pre_prior: Tensor,
                    batch_size: int,
                    num_gt: int) -> Tuple[Tensor, Tensor, Tensor]:
        """Get assigner info.

        Args:
            gt_labels (Tensor): Ground true labels,
                shape(batch_size, num_gt, 1)
            gt_bboxes (Tensor): Ground true bboxes,
                shape(batch_size, num_gt, 4)
            assigned_gt_idxs (Tensor): Assigned ground truth indexes,
                shape(batch_size, num_priors)
            fg_mask_pre_prior (Tensor): Force ground truth matching mask,
                shape(batch_size, num_priors)
            batch_size (int): Batch size.
            num_gt (int): Number of ground truth.
        Returns:
            assigned_labels (Tensor): Assigned labels,
                shape(batch_size, num_priors)
            assigned_bboxes (Tensor): Assigned bboxes,
                shape(batch_size, num_priors, 4)
            assigned_scores (Tensor): One-hot assigned scores, zeroed for
                priors outside the foreground mask,
                shape(batch_size, num_priors, num_classes)
        """
        # assigned target labels
        batch_ind = torch.arange(
            end=batch_size, dtype=torch.int64, device=gt_labels.device)[...,
                                                                        None]
        # Turn per-image gt indexes into indexes of the flattened gt tensors.
        assigned_gt_idxs = assigned_gt_idxs + batch_ind * num_gt
        assigned_labels = gt_labels.long().flatten()[assigned_gt_idxs]

        # assigned target boxes
        assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_idxs]

        # assigned target scores
        # Negative labels are clamped to 0 so that one_hot accepts them.
        assigned_labels[assigned_labels < 0] = 0
        assigned_scores = F.one_hot(assigned_labels, self.num_classes)
        force_gt_scores_mask = fg_mask_pre_prior[:, :, None].repeat(
            1, 1, self.num_classes)
        assigned_scores = torch.where(force_gt_scores_mask > 0,
                                      assigned_scores,
                                      torch.full_like(assigned_scores, 0))

        return assigned_labels, assigned_bboxes, assigned_scores
+from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_overlaps + + +def _cat_multi_level_tensor_in_place(*multi_level_tensor, place_hold_var): + """concat multi-level tensor in place.""" + for level_tensor in multi_level_tensor: + for i, var in enumerate(level_tensor): + if len(var) > 0: + level_tensor[i] = torch.cat(var, dim=0) + else: + level_tensor[i] = place_hold_var + + +class BatchYOLOv7Assigner(nn.Module): + """Batch YOLOv7 Assigner. + + It consists of two assigning steps: + + 1. YOLOv5 cross-grid sample assigning + 2. SimOTA assigning + + This code referenced to + https://github.com/WongKinYiu/yolov7/blob/main/utils/loss.py. + + Args: + num_classes (int): Number of classes. + num_base_priors (int): Number of base priors. + featmap_strides (Sequence[int]): Feature map strides. + prior_match_thr (float): Threshold to match priors. + Defaults to 4.0. + candidate_topk (int): Number of topk candidates to + assign. Defaults to 10. + iou_weight (float): IOU weight. Defaults to 3.0. + cls_weight (float): Class weight. Defaults to 1.0. 
class BatchYOLOv7Assigner(nn.Module):
    """Batch YOLOv7 Assigner.

    It consists of two assigning steps:

        1. YOLOv5 cross-grid sample assigning
        2. SimOTA assigning

    This code referenced to
    https://github.com/WongKinYiu/yolov7/blob/main/utils/loss.py.

    Args:
        num_classes (int): Number of classes.
        num_base_priors (int): Number of base priors.
        featmap_strides (Sequence[int]): Feature map strides.
        prior_match_thr (float): Threshold to match priors.
            Defaults to 4.0.
        candidate_topk (int): Number of topk candidates to
            assign. Defaults to 10.
        iou_weight (float): IOU weight. Defaults to 3.0.
        cls_weight (float): Class weight. Defaults to 1.0.
    """

    def __init__(self,
                 num_classes: int,
                 num_base_priors: int,
                 featmap_strides: Sequence[int],
                 prior_match_thr: float = 4.0,
                 candidate_topk: int = 10,
                 iou_weight: float = 3.0,
                 cls_weight: float = 1.0):
        super().__init__()
        self.num_classes = num_classes
        self.num_base_priors = num_base_priors
        self.featmap_strides = featmap_strides
        # yolov5 param
        self.prior_match_thr = prior_match_thr
        # simota param
        self.candidate_topk = candidate_topk
        self.iou_weight = iou_weight
        self.cls_weight = cls_weight

    @torch.no_grad()
    def forward(self,
                pred_results,
                batch_targets_normed,
                batch_input_shape,
                priors_base_sizes,
                grid_offset,
                near_neighbor_thr=0.5) -> dict:
        """Run both assigning steps and collect per-level results.

        Args:
            pred_results: Per-level head predictions; one tensor per level.
            batch_targets_normed: Normalised targets, indexed as
                (num_base_priors, num_batch_gt, 7).
            batch_input_shape: Network input shape; reversed before it is
                used to scale the targets.
            priors_base_sizes: Per-level base prior sizes.
            grid_offset: Offsets used to pick neighbouring grid cells.
            near_neighbor_thr (float): Threshold for also taking
                neighbouring cells as positives. Defaults to 0.5.

        Returns:
            dict: ``mlvl_positive_infos``, ``mlvl_priors`` and
            ``mlvl_targets_normed``, each a list with one entry per level.
        """
        # (num_base_priors, num_batch_gt, 7)
        # 7 is mean (batch_idx, cls_id, x_norm, y_norm,
        # w_norm, h_norm, prior_idx)

        # mlvl is mean multi_level
        if batch_targets_normed.shape[1] == 0:
            # empty gt of batch
            num_levels = len(pred_results)
            # ``[] * num_levels`` is always ``[]``; build one empty
            # placeholder per level so all three lists have the same length.
            return dict(
                mlvl_positive_infos=[pred_results[0].new_empty(
                    (0, 4))] * num_levels,
                mlvl_priors=[[] for _ in range(num_levels)],
                mlvl_targets_normed=[[] for _ in range(num_levels)])

        # if near_neighbor_thr = 0.5 are mean the nearest
        # 3 neighbors are also considered positive samples.
        # if near_neighbor_thr = 1.0 are mean the nearest
        # 5 neighbors are also considered positive samples.
        mlvl_positive_infos, mlvl_priors = self.yolov5_assigner(
            pred_results,
            batch_targets_normed,
            priors_base_sizes,
            grid_offset,
            near_neighbor_thr=near_neighbor_thr)

        mlvl_positive_infos, mlvl_priors, \
            mlvl_targets_normed = self.simota_assigner(
                pred_results, batch_targets_normed, mlvl_positive_infos,
                mlvl_priors, batch_input_shape)

        # Levels without any match are filled with an empty placeholder.
        place_hold_var = batch_targets_normed.new_empty((0, 4))
        _cat_multi_level_tensor_in_place(
            mlvl_positive_infos,
            mlvl_priors,
            mlvl_targets_normed,
            place_hold_var=place_hold_var)

        return dict(
            mlvl_positive_infos=mlvl_positive_infos,
            mlvl_priors=mlvl_priors,
            mlvl_targets_normed=mlvl_targets_normed)

    def yolov5_assigner(self,
                        pred_results,
                        batch_targets_normed,
                        priors_base_sizes,
                        grid_offset,
                        near_neighbor_thr=0.5):
        """YOLOv5 cross-grid sample assigner.

        Returns:
            tuple: Per-level lists ``(mlvl_positive_infos, mlvl_priors)``.
            A level without any shape-matched target contributes an empty
            (0, 4) tensor and an empty list respectively.
        """
        num_batch_gts = batch_targets_normed.shape[1]
        assert num_batch_gts > 0

        mlvl_positive_infos, mlvl_priors = [], []

        scaled_factor = torch.ones(7, device=pred_results[0].device)
        for i in range(len(pred_results)):  # level
            priors_base_sizes_i = priors_base_sizes[i]
            # (1, 1, feat_shape_w, feat_shape_h, feat_shape_w, feat_shape_h)
            scaled_factor[2:6] = torch.tensor(
                pred_results[i].shape)[[3, 2, 3, 2]]

            # Scale batch_targets from range 0-1 to range 0-features_maps size.
            # (num_base_priors, num_batch_gts, 7)
            batch_targets_scaled = batch_targets_normed * scaled_factor

            # Shape match
            wh_ratio = batch_targets_scaled[...,
                                            4:6] / priors_base_sizes_i[:, None]
            match_inds = torch.max(
                wh_ratio, 1. / wh_ratio).max(2)[0] < self.prior_match_thr
            batch_targets_scaled = batch_targets_scaled[
                match_inds]  # (num_matched_target, 7)

            # no gt bbox matches anchor
            if batch_targets_scaled.shape[0] == 0:
                mlvl_positive_infos.append(
                    batch_targets_scaled.new_empty((0, 4)))
                mlvl_priors.append([])
                continue

            # Positive samples with additional neighbors
            batch_targets_cxcy = batch_targets_scaled[:, 2:4]
            grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy
            left, up = ((batch_targets_cxcy % 1 < near_neighbor_thr) &
                        (batch_targets_cxcy > 1)).T
            right, bottom = ((grid_xy % 1 < near_neighbor_thr) &
                             (grid_xy > 1)).T
            # Row 0 keeps the cell itself, rows 1-4 the selected neighbours.
            offset_inds = torch.stack(
                (torch.ones_like(left), left, up, right, bottom))
            batch_targets_scaled = batch_targets_scaled.repeat(
                (5, 1, 1))[offset_inds]
            retained_offsets = grid_offset.repeat(1, offset_inds.shape[1],
                                                  1)[offset_inds]

            # batch_targets_scaled: (num_matched_target, 7)
            # 7 is mean (batch_idx, cls_id, x_scaled,
            # y_scaled, w_scaled, h_scaled, prior_idx)

            # mlvl_positive_info: (num_matched_target, 4)
            # 4 is mean (batch_idx, prior_idx, x_scaled, y_scaled)
            mlvl_positive_info = batch_targets_scaled[:, [0, 6, 2, 3]]
            retained_offsets = retained_offsets * near_neighbor_thr
            mlvl_positive_info[:,
                               2:] = mlvl_positive_info[:,
                                                        2:] - retained_offsets
            # Keep the grid coordinates inside the feature map.
            mlvl_positive_info[:, 2].clamp_(0, scaled_factor[2] - 1)
            mlvl_positive_info[:, 3].clamp_(0, scaled_factor[3] - 1)
            mlvl_positive_info = mlvl_positive_info.long()
            priors_inds = mlvl_positive_info[:, 1]

            mlvl_positive_infos.append(mlvl_positive_info)
            mlvl_priors.append(priors_base_sizes_i[priors_inds])

        return mlvl_positive_infos, mlvl_priors

    def simota_assigner(self, pred_results, batch_targets_normed,
                        mlvl_positive_infos, mlvl_priors, batch_input_shape):
        """SimOTA assigner.

        Refines the YOLOv5 candidates image by image with a dynamic top-k
        matching on a combined classification + IoU cost.

        Returns:
            tuple: ``(mlvl_positive_infos_matched, mlvl_priors_matched,
            mlvl_targets_normed_matched)``; each is a list with one entry per
            level, and each entry is a list holding one tensor per processed
            image.
        """
        num_batch_gts = batch_targets_normed.shape[1]
        assert num_batch_gts > 0
        num_levels = len(mlvl_positive_infos)

        mlvl_positive_infos_matched = [[] for _ in range(num_levels)]
        mlvl_priors_matched = [[] for _ in range(num_levels)]
        mlvl_targets_normed_matched = [[] for _ in range(num_levels)]

        for batch_idx in range(pred_results[0].shape[0]):
            # (num_batch_gt, 7)
            # 7 is mean (batch_idx, cls_id, x_norm, y_norm,
            # w_norm, h_norm, prior_idx)
            targets_normed = batch_targets_normed[0]
            # (num_gt, 7)
            targets_normed = targets_normed[targets_normed[:, 0] == batch_idx]
            num_gts = targets_normed.shape[0]

            if num_gts == 0:
                continue

            _mlvl_decoderd_bboxes = []
            _mlvl_obj_cls = []
            _mlvl_priors = []
            _mlvl_positive_infos = []
            _from_which_layer = []

            for i, head_pred in enumerate(pred_results):
                # (num_matched_target, 4)
                # 4 is mean (batch_idx, prior_idx, grid_x, grid_y)
                _mlvl_positive_info = mlvl_positive_infos[i]
                if _mlvl_positive_info.shape[0] == 0:
                    continue

                idx = (_mlvl_positive_info[:, 0] == batch_idx)
                _mlvl_positive_info = _mlvl_positive_info[idx]
                _mlvl_positive_infos.append(_mlvl_positive_info)

                priors = mlvl_priors[i][idx]
                _mlvl_priors.append(priors)

                # Remember the level of every candidate for the final split.
                _from_which_layer.append(
                    _mlvl_positive_info.new_full(
                        size=(_mlvl_positive_info.shape[0], ), fill_value=i))

                # (n,85)
                level_batch_idx, prior_ind, \
                    grid_x, grid_y = _mlvl_positive_info.T
                pred_positive = head_pred[level_batch_idx, prior_ind, grid_y,
                                          grid_x]
                _mlvl_obj_cls.append(pred_positive[:, 4:])

                # decoded
                grid = torch.stack([grid_x, grid_y], dim=1)
                pred_positive_cxcy = (pred_positive[:, :2].sigmoid() * 2. -
                                      0.5 + grid) * self.featmap_strides[i]
                pred_positive_wh = (pred_positive[:, 2:4].sigmoid() * 2) ** 2 \
                    * priors * self.featmap_strides[i]
                pred_positive_xywh = torch.cat(
                    [pred_positive_cxcy, pred_positive_wh], dim=-1)
                _mlvl_decoderd_bboxes.append(pred_positive_xywh)

            if len(_mlvl_decoderd_bboxes) == 0:
                continue

            # 1 calc pair_wise_iou_loss
            _mlvl_decoderd_bboxes = torch.cat(_mlvl_decoderd_bboxes, dim=0)
            num_pred_positive = _mlvl_decoderd_bboxes.shape[0]

            if num_pred_positive == 0:
                continue

            # scaled xywh
            batch_input_shape_wh = pred_results[0].new_tensor(
                batch_input_shape[::-1]).repeat((1, 2))
            targets_scaled_bbox = targets_normed[:, 2:6] * batch_input_shape_wh

            targets_scaled_bbox = bbox_cxcywh_to_xyxy(targets_scaled_bbox)
            _mlvl_decoderd_bboxes = bbox_cxcywh_to_xyxy(_mlvl_decoderd_bboxes)
            pair_wise_iou = bbox_overlaps(targets_scaled_bbox,
                                          _mlvl_decoderd_bboxes)
            pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8)

            # 2 calc pair_wise_cls_loss
            _mlvl_obj_cls = torch.cat(_mlvl_obj_cls, dim=0).float().sigmoid()
            _mlvl_positive_infos = torch.cat(_mlvl_positive_infos, dim=0)
            _from_which_layer = torch.cat(_from_which_layer, dim=0)
            _mlvl_priors = torch.cat(_mlvl_priors, dim=0)

            gt_cls_per_image = (
                F.one_hot(targets_normed[:, 1].to(torch.int64),
                          self.num_classes).float().unsqueeze(1).repeat(
                              1, num_pred_positive, 1))
            # cls_score * obj
            cls_preds_ = _mlvl_obj_cls[:, 1:]\
                .unsqueeze(0)\
                .repeat(num_gts, 1, 1) \
                * _mlvl_obj_cls[:, 0:1]\
                .unsqueeze(0).repeat(num_gts, 1, 1)
            # The probability is turned back into a logit for the BCE call.
            y = cls_preds_.sqrt_()
            pair_wise_cls_loss = F.binary_cross_entropy_with_logits(
                torch.log(y / (1 - y)), gt_cls_per_image,
                reduction='none').sum(-1)
            del cls_preds_

            # calc cost
            cost = (
                self.cls_weight * pair_wise_cls_loss +
                self.iou_weight * pair_wise_iou_loss)

            # num_gt, num_match_pred
            matching_matrix = torch.zeros_like(cost)

            top_k, _ = torch.topk(
                pair_wise_iou,
                min(self.candidate_topk, pair_wise_iou.shape[1]),
                dim=1)
            dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1)

            # Select only topk matches per gt
            for gt_idx in range(num_gts):
                _, pos_idx = torch.topk(
                    cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
                matching_matrix[gt_idx][pos_idx] = 1.0
            del top_k, dynamic_ks

            # Each prediction box can match at most one gt box,
            # and if there are more than one,
            # only the least costly one can be taken
            anchor_matching_gt = matching_matrix.sum(0)
            if (anchor_matching_gt > 1).sum() > 0:
                _, cost_argmin = torch.min(
                    cost[:, anchor_matching_gt > 1], dim=0)
                matching_matrix[:, anchor_matching_gt > 1] *= 0.0
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

            targets_normed = targets_normed[matched_gt_inds]
            _mlvl_positive_infos = _mlvl_positive_infos[fg_mask_inboxes]
            _from_which_layer = _from_which_layer[fg_mask_inboxes]
            _mlvl_priors = _mlvl_priors[fg_mask_inboxes]

            # Rearranged in the order of the prediction layers
            # to facilitate loss
            for i in range(num_levels):
                layer_idx = _from_which_layer == i
                mlvl_positive_infos_matched[i].append(
                    _mlvl_positive_infos[layer_idx])
                mlvl_priors_matched[i].append(_mlvl_priors[layer_idx])
                mlvl_targets_normed_matched[i].append(
                    targets_normed[layer_idx])

        results = mlvl_positive_infos_matched, \
            mlvl_priors_matched, \
            mlvl_targets_normed_matched
        return results
@TASK_UTILS.register_module()
class PoseSimOTAAssigner(SimOTAAssigner):
    """SimOTA assigner extended with keypoint costs.

    On top of the IoU and classification costs, the matching cost can include
    an OKS (keypoint similarity) cost and a keypoint-visibility cost. Each
    term is added only when its weight is greater than zero.

    Args:
        center_radius (float): Stored as ``self.center_radius``.
            Defaults to 2.5.
        candidate_topk (int): Number of top-IoU candidates summed to get the
            dynamic k of every gt. Defaults to 10.
        iou_weight (float): Weight of the IoU cost. Defaults to 3.0.
        cls_weight (float): Weight of the classification cost.
            Defaults to 1.0.
        oks_weight (float): Weight of the OKS cost. Defaults to 0.0.
        vis_weight (float): Weight of the keypoint-visibility cost.
            Defaults to 0.0.
        iou_calculator (ConfigType): Config of the IoU calculator.
            Defaults to dict(type='BboxOverlaps2D').
        oks_calculator (ConfigType): Config of the OKS calculator; the built
            object must offer ``compute_oks``.
            Defaults to dict(type='OksLoss').
    """

    def __init__(self,
                 center_radius: float = 2.5,
                 candidate_topk: int = 10,
                 iou_weight: float = 3.0,
                 cls_weight: float = 1.0,
                 oks_weight: float = 0.0,
                 vis_weight: float = 0.0,
                 iou_calculator: ConfigType = dict(type='BboxOverlaps2D'),
                 oks_calculator: ConfigType = dict(type='OksLoss')):

        # NOTE(review): the parent initialiser is not called; every attribute
        # is set here directly -- confirm this is intentional.
        self.center_radius = center_radius
        self.candidate_topk = candidate_topk
        self.iou_weight = iou_weight
        self.cls_weight = cls_weight
        self.oks_weight = oks_weight
        self.vis_weight = vis_weight

        self.iou_calculator = TASK_UTILS.build(iou_calculator)
        self.oks_calculator = MODELS.build(oks_calculator)

    def assign(self,
               pred_instances: InstanceData,
               gt_instances: InstanceData,
               gt_instances_ignore: Optional[InstanceData] = None,
               **kwargs) -> AssignResult:
        """Assign gt to priors using SimOTA.

        Args:
            pred_instances (:obj:`InstanceData`): Instances of model
                predictions. It includes ``priors``, and the priors can
                be anchors or points, or the bboxes predicted by the
                previous stage, has shape (n, 4). The bboxes predicted by
                the current model or stage will be named ``bboxes``,
                ``labels``, and ``scores``, the same as the ``InstanceData``
                in other places. Here ``bboxes`` carries the box in its
                first 4 columns and the keypoints as (x, y, visibility)
                triplets in the remaining columns.
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It usually includes ``bboxes``, with shape (k, 4),
                and ``labels``, with shape (k, ). ``keypoints`` and
                ``keypoints_visible`` are read as well.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None. Not used by this method.
        Returns:
            obj:`AssignResult`: The assigned result. ``max_overlaps`` holds
            the matched OKS-based metric (not the IoU) for positive priors.
        """
        gt_bboxes = gt_instances.bboxes
        gt_labels = gt_instances.labels
        gt_keypoints = gt_instances.keypoints
        gt_keypoints_visible = gt_instances.keypoints_visible
        num_gt = gt_bboxes.size(0)

        # Split the packed prediction into box, keypoint xy and visibility.
        decoded_bboxes = pred_instances.bboxes[..., :4]
        pred_kpts = pred_instances.bboxes[..., 4:]
        pred_kpts = pred_kpts.reshape(*pred_kpts.shape[:-1], -1, 3)
        pred_kpts_vis = pred_kpts[..., -1]
        pred_kpts = pred_kpts[..., :2]
        pred_scores = pred_instances.scores
        priors = pred_instances.priors
        num_bboxes = decoded_bboxes.size(0)

        # assign 0 by default
        assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ),
                                                   0,
                                                   dtype=torch.long)
        if num_gt == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
                                                      -1,
                                                      dtype=torch.long)
            return AssignResult(
                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)

        # presumably inherited from SimOTAAssigner -- verify against mmdet
        valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
            priors, gt_bboxes)
        valid_decoded_bbox = decoded_bboxes[valid_mask]
        valid_pred_scores = pred_scores[valid_mask]
        valid_pred_kpts = pred_kpts[valid_mask]
        valid_pred_kpts_vis = pred_kpts_vis[valid_mask]
        num_valid = valid_decoded_bbox.size(0)
        if num_valid == 0:
            # No valid bboxes, return empty assignment
            max_overlaps = decoded_bboxes.new_zeros((num_bboxes, ))
            assigned_labels = decoded_bboxes.new_full((num_bboxes, ),
                                                      -1,
                                                      dtype=torch.long)
            return AssignResult(
                num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)

        # Pairs outside "in box and in center" start with a cost of INF.
        cost_matrix = (~is_in_boxes_and_center) * INF

        # calculate iou
        pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes)
        if self.iou_weight > 0:
            iou_cost = -torch.log(pairwise_ious + EPS)
            cost_matrix = cost_matrix + iou_cost * self.iou_weight

        # calculate oks
        # (always computed: it is the matching metric even if oks_weight is 0)
        pairwise_oks = self.oks_calculator.compute_oks(
            valid_pred_kpts.unsqueeze(1),  # [num_valid, 1, k, 2]
            gt_keypoints.unsqueeze(0),  # [1, num_gt, k, 2]
            gt_keypoints_visible.unsqueeze(0),  # [1, num_gt, k]
            bboxes=gt_bboxes.unsqueeze(0),  # [1, num_gt, 4]
        )  # -> [num_valid, num_gt]
        if self.oks_weight > 0:
            oks_cost = -torch.log(pairwise_oks + EPS)
            cost_matrix = cost_matrix + oks_cost * self.oks_weight

        # calculate cls
        if self.cls_weight > 0:
            gt_onehot_label = (
                F.one_hot(gt_labels.to(torch.int64),
                          pred_scores.shape[-1]).float().unsqueeze(0).repeat(
                              num_valid, 1, 1))

            valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(
                1, num_gt, 1)
            # disable AMP autocast to avoid overflow
            with torch.cuda.amp.autocast(enabled=False):
                cls_cost = (
                    F.binary_cross_entropy(
                        valid_pred_scores.to(dtype=torch.float32),
                        gt_onehot_label,
                        reduction='none',
                    ).sum(-1).to(dtype=valid_pred_scores.dtype))
            cost_matrix = cost_matrix + cls_cost * self.cls_weight

        # calculate vis
        if self.vis_weight > 0:
            valid_pred_kpts_vis = valid_pred_kpts_vis.sigmoid().unsqueeze(
                1).repeat(1, num_gt, 1)  # [num_valid, num_gt, k]
            gt_kpt_vis = gt_keypoints_visible.unsqueeze(
                0).float()  # [1, num_gt, k]
            with torch.cuda.amp.autocast(enabled=False):
                vis_cost = (
                    F.binary_cross_entropy(
                        valid_pred_kpts_vis.to(dtype=torch.float32),
                        gt_kpt_vis.repeat(num_valid, 1, 1),
                        reduction='none',
                    ).sum(-1).to(dtype=valid_pred_kpts_vis.dtype))
            cost_matrix = cost_matrix + vis_cost * self.vis_weight

        # mixed metric
        pairwise_oks = pairwise_oks.pow(0.5)
        # dynamic_k_matching also narrows ``valid_mask`` in place to the
        # priors that end up as foreground.
        matched_pred_oks, matched_gt_inds = \
            self.dynamic_k_matching(
                cost_matrix, pairwise_ious, pairwise_oks, num_gt, valid_mask)

        # convert to AssignResult format
        assigned_gt_inds[valid_mask] = matched_gt_inds + 1
        assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1)
        assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long()
        max_overlaps = assigned_gt_inds.new_full((num_bboxes, ),
                                                 -INF,
                                                 dtype=torch.float32)
        max_overlaps[valid_mask] = matched_pred_oks
        return AssignResult(
            num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels)

    def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor,
                           pairwise_oks: Tensor, num_gt: int,
                           valid_mask: Tensor) -> Tuple[Tensor, Tensor]:
        """Use IoU and matching cost to calculate the dynamic top-k positive
        targets.

        Args:
            cost (Tensor): Cost matrix, indexed as (valid prior, gt).
            pairwise_ious (Tensor): IoU matrix with the same layout; used to
                derive the dynamic k of every gt.
            pairwise_oks (Tensor): OKS-based metric with the same layout;
                returned for the matched pairs.
            num_gt (int): Number of ground-truth instances.
            valid_mask (Tensor): Boolean mask over all priors. Modified in
                place so that only foreground priors stay True.

        Returns:
            tuple: Matched metric and matched gt index of every foreground
            prior.
        """
        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
        # select candidate topk ious for dynamic-k calculation
        candidate_topk = min(self.candidate_topk, pairwise_ious.size(0))
        topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0)
        # calculate dynamic k for each gt
        dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)
        for gt_idx in range(num_gt):
            _, pos_idx = torch.topk(
                cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
            matching_matrix[:, gt_idx][pos_idx] = 1

        # NOTE(review): ``pos_idx`` only exists if the loop ran at least
        # once, i.e. num_gt > 0 -- callers must guarantee that.
        del topk_ious, dynamic_ks, pos_idx

        # A prior matched to several gts keeps only its cheapest gt.
        prior_match_gt_mask = matching_matrix.sum(1) > 1
        if prior_match_gt_mask.sum() > 0:
            cost_min, cost_argmin = torch.min(
                cost[prior_match_gt_mask, :], dim=1)
            matching_matrix[prior_match_gt_mask, :] *= 0
            matching_matrix[prior_match_gt_mask, cost_argmin] = 1
        # get foreground mask inside box and center prior
        fg_mask_inboxes = matching_matrix.sum(1) > 0
        # In-place update: the clone is used as index so that the write does
        # not read from the tensor it is modifying.
        valid_mask[valid_mask.clone()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)
        matched_pred_oks = (matching_matrix *
                            pairwise_oks).sum(1)[fg_mask_inboxes]
        return matched_pred_oks, matched_gt_inds
-0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def select_candidates_in_gts(priors_points: Tensor, + gt_bboxes: Tensor, + eps: float = 1e-9) -> Tensor: + """Select the positive priors' center in gt. + + Args: + priors_points (Tensor): Model priors points, + shape(num_priors, 2) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + eps (float): Default to 1e-9. + Return: + (Tensor): shape(batch_size, num_gt, num_priors) + """ + batch_size, num_gt, _ = gt_bboxes.size() + gt_bboxes = gt_bboxes.reshape([-1, 4]) + + priors_number = priors_points.size(0) + priors_points = priors_points.unsqueeze(0).repeat(batch_size * num_gt, 1, + 1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + gt_bboxes_lt = gt_bboxes[:, 0:2].unsqueeze(1).repeat(1, priors_number, 1) + gt_bboxes_rb = gt_bboxes[:, 2:4].unsqueeze(1).repeat(1, priors_number, 1) + bbox_deltas = torch.cat( + [priors_points - gt_bboxes_lt, gt_bboxes_rb - priors_points], dim=-1) + bbox_deltas = bbox_deltas.reshape([batch_size, num_gt, priors_number, -1]) + + return (bbox_deltas.min(axis=-1)[0] > eps).to(gt_bboxes.dtype) + + +def select_highest_overlaps(pos_mask: Tensor, overlaps: Tensor, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """If an anchor box is assigned to multiple gts, the one with the highest + iou will be selected. + + Args: + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): IoU between all bbox and ground truth, + shape(batch_size, num_gt, num_priors) + num_gt (int): Number of ground truth. 
+ Return: + gt_idx_pre_prior (Tensor): Target ground truth index, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force matching ground truth, + shape(batch_size, num_priors) + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + """ + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + # Make sure the positive sample matches the only one and is the largest IoU + if fg_mask_pre_prior.max() > 1: + mask_multi_gts = (fg_mask_pre_prior.unsqueeze(1) > 1).repeat( + [1, num_gt, 1]) + index = overlaps.argmax(axis=1) + is_max_overlaps = F.one_hot(index, num_gt) + is_max_overlaps = \ + is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype) + + pos_mask = torch.where(mask_multi_gts, is_max_overlaps, pos_mask) + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + gt_idx_pre_prior = pos_mask.argmax(axis=-2) + return gt_idx_pre_prior, fg_mask_pre_prior, pos_mask + + +# TODO:'mmdet.BboxOverlaps2D' will cause gradient inconsistency, +# which will be found and solved in a later version. +def yolov6_iou_calculator(bbox1: Tensor, + bbox2: Tensor, + eps: float = 1e-9) -> Tensor: + """Calculate iou for batch. + + Args: + bbox1 (Tensor): shape(batch size, num_gt, 4) + bbox2 (Tensor): shape(batch size, num_priors, 4) + eps (float): Default to 1e-9. 
+ Return: + (Tensor): IoU, shape(size, num_gt, num_priors) + """ + bbox1 = bbox1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + bbox2 = bbox2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + + # calculate xy info of predict and gt bbox + bbox1_x1y1, bbox1_x2y2 = bbox1[:, :, :, 0:2], bbox1[:, :, :, 2:4] + bbox2_x1y1, bbox2_x2y2 = bbox2[:, :, :, 0:2], bbox2[:, :, :, 2:4] + + # calculate overlap area + overlap = (torch.minimum(bbox1_x2y2, bbox2_x2y2) - + torch.maximum(bbox1_x1y1, bbox2_x1y1)).clip(0).prod(-1) + + # calculate bbox area + bbox1_area = (bbox1_x2y2 - bbox1_x1y1).clip(0).prod(-1) + bbox2_area = (bbox2_x2y2 - bbox2_x1y1).clip(0).prod(-1) + + union = bbox1_area + bbox2_area - overlap + eps + + return overlap / union diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75b6e7d6b30afd3de21c738dfc8e75df2eae7120 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .distance_angle_point_coder import DistanceAnglePointCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .yolov5_bbox_coder import YOLOv5BBoxCoder +from .yolox_bbox_coder import YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'DistancePointBBoxCoder', + 'DistanceAnglePointCoder' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e322f94725ee548c9b261be6f5bae2f3d9b4d9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch + +from mmyolo.registry import TASK_UTILS + +try: + from mmrotate.models.task_modules.coders import \ + DistanceAnglePointCoder as MMROTATE_DistanceAnglePointCoder + MMROTATE_AVAILABLE = True +except ImportError: + from mmdet.models.task_modules.coders import BaseBBoxCoder + MMROTATE_DistanceAnglePointCoder = BaseBBoxCoder + MMROTATE_AVAILABLE = False + + +@TASK_UTILS.register_module() +class DistanceAnglePointCoder(MMROTATE_DistanceAnglePointCoder): + """Distance Angle Point BBox coder. + + This coder encodes gt bboxes (x, y, w, h, theta) into (top, bottom, left, + right, theta) and decode it back to the original. 
+ """ + + def __init__(self, clip_border=True, angle_version='oc'): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(clip_border=clip_border, angle_version=angle_version) + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None, + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries and angle (left, top, right, bottom, angle). + Shape (B, N, 5) or (N, 5) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Tensor: Boxes with shape (N, 5) or (B, N, 5) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 5 + if self.clip_border is False: + max_shape = None + + if pred_bboxes.dim() == 2: + stride = stride[:, None] + else: + stride = stride[None, :, None] + pred_bboxes[..., :4] = pred_bboxes[..., :4] * stride + + return self.distance2obb(points, pred_bboxes, max_shape, + self.angle_version) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 5), The format is "xywha" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. 
+ + Returns: + Tensor: Box transformation deltas. The shape is (N, 5). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 5 + return self.obb2distance(points, gt_bboxes, max_dis, eps) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..16417b8ab209c57880cfcfe0ba2a955e78c0a3f0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch +from mmdet.models.task_modules.coders import \ + DistancePointBBoxCoder as MMDET_DistancePointBBoxCoder +from mmdet.structures.bbox import bbox2distance, distance2bbox + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class DistancePointBBoxCoder(MMDET_DistancePointBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + """ + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + stride (Tensor): Featmap stride. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). 
If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + + pred_bboxes = pred_bboxes * stride[None, :, None] + + return distance2bbox(points, pred_bboxes, max_shape) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. The rewrite is to support batch + operations. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2), The format is [x, y]. + gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format + is "xyxy" + max_dis (float): Upper bound of the distance. Default to 16.. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.01. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4) or + (B, N, 4). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..bab5f0e0fe06c1930497bdece7c7a06636fe9c37 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOv5BBoxCoder(BaseBBoxCoder): + """YOLOv5 BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + assert pred_bboxes.size(-1) == priors.size(-1) == 4 + + pred_bboxes = pred_bboxes.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + # The anchor of mmdet has been offset by 0.5 + x_center_pred = (pred_bboxes[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (pred_bboxes[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (pred_bboxes[..., 2] * 2)**2 * w + h_pred = (pred_bboxes[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + (x_center_pred - w_pred / 2, y_center_pred - h_pred / 2, + x_center_pred + w_pred / 2, y_center_pred + h_pred / 2), + dim=-1) + + return decoded_bboxes diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..02c898d814e89e5c8ef4db792831a7ba80c7c0cc --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOXBBoxCoder(BaseBBoxCoder): + """YOLOX BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + stride = stride[None, :, None] + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + tl_x = (xys[..., 0] - whs[..., 0] / 2) + tl_y = (xys[..., 1] - whs[..., 1] / 2) + br_x = (xys[..., 0] + whs[..., 0] / 2) + br_y = (xys[..., 1] + whs[..., 1] / 2) + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d62ff80e25ea5adad8524fd6f756f1db5e4de4d5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .misc import (OutputSaveFunctionWrapper, OutputSaveObjectWrapper, + gt_instances_preprocess, make_divisible, make_round) + +__all__ = [ + 'make_divisible', 'make_round', 'gt_instances_preprocess', + 'OutputSaveFunctionWrapper', 'OutputSaveObjectWrapper' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/misc.py b/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..96cd1195aefb2fbf5db7535be785dae2fab4add9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/models/utils/misc.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from collections import defaultdict +from copy import deepcopy +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union + +import torch +from mmdet.structures.bbox.transforms import get_box_tensor +from torch import Tensor + + +def make_divisible(x: float, + widen_factor: float = 1.0, + divisor: int = 8) -> int: + """Make sure that x*widen_factor is divisible by divisor.""" + return math.ceil(x * widen_factor / divisor) * divisor + + +def make_round(x: float, deepen_factor: float = 1.0) -> int: + """Make sure that x*deepen_factor becomes an integer not less than 1.""" + return max(round(x * deepen_factor), 1) if x > 1 else x + + +def gt_instances_preprocess(batch_gt_instances: Union[Tensor, Sequence], + batch_size: int) -> Tensor: + """Split batch_gt_instances with batch size. + + From [all_gt_bboxes, box_dim+2] to [batch_size, number_gt, box_dim+1]. + For horizontal box, box_dim=4, for rotated box, box_dim=5 + + If some shape of single batch smaller than + gt bbox len, then using zeros to fill. + + Args: + batch_gt_instances (Sequence[Tensor]): Ground truth + instances for whole batch, shape [all_gt_bboxes, box_dim+2] + batch_size (int): Batch size. 
+ + Returns: + Tensor: batch gt instances data, shape + [batch_size, number_gt, box_dim+1] + """ + if isinstance(batch_gt_instances, Sequence): + max_gt_bbox_len = max( + [len(gt_instances) for gt_instances in batch_gt_instances]) + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance_list = [] + for index, gt_instance in enumerate(batch_gt_instances): + bboxes = gt_instance.bboxes + labels = gt_instance.labels + box_dim = get_box_tensor(bboxes).size(-1) + batch_instance_list.append( + torch.cat((labels[:, None], bboxes), dim=-1)) + + if bboxes.shape[0] >= max_gt_bbox_len: + continue + + fill_tensor = bboxes.new_full( + [max_gt_bbox_len - bboxes.shape[0], box_dim + 1], 0) + batch_instance_list[index] = torch.cat( + (batch_instance_list[index], fill_tensor), dim=0) + + return torch.stack(batch_instance_list) + else: + # faster version + # format of batch_gt_instances: [img_ind, cls_ind, (box)] + # For example horizontal box should be: + # [img_ind, cls_ind, x1, y1, x2, y2] + # Rotated box should be + # [img_ind, cls_ind, x, y, w, h, a] + + # sqlit batch gt instance [all_gt_bboxes, box_dim+2] -> + # [batch_size, max_gt_bbox_len, box_dim+1] + assert isinstance(batch_gt_instances, Tensor) + box_dim = batch_gt_instances.size(-1) - 2 + if len(batch_gt_instances) > 0: + gt_images_indexes = batch_gt_instances[:, 0] + max_gt_bbox_len = gt_images_indexes.unique( + return_counts=True)[1].max() + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance = torch.zeros( + (batch_size, max_gt_bbox_len, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + for i in range(batch_size): + match_indexes = gt_images_indexes == i + gt_num = match_indexes.sum() + if gt_num: + batch_instance[i, :gt_num] = batch_gt_instances[ + match_indexes, 1:] + else: + batch_instance = torch.zeros((batch_size, 0, box_dim + 1), + 
dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + return batch_instance + + +class OutputSaveObjectWrapper: + """A wrapper class that saves the output of function calls on an object.""" + + def __init__(self, obj: Any) -> None: + self.obj = obj + self.log = defaultdict(list) + + def __getattr__(self, attr: str) -> Any: + """Overrides the default behavior when an attribute is accessed. + + - If the attribute is callable, hooks the attribute and saves the + returned value of the function call to the log. + - If the attribute is not callable, saves the attribute's value to the + log and returns the value. + """ + orig_attr = getattr(self.obj, attr) + + if not callable(orig_attr): + self.log[attr].append(orig_attr) + return orig_attr + + def hooked(*args: Tuple, **kwargs: Dict) -> Any: + """The hooked function that logs the return value of the original + function.""" + result = orig_attr(*args, **kwargs) + self.log[attr].append(result) + return result + + return hooked + + def clear(self): + """Clears the log of function call outputs.""" + self.log.clear() + + def __deepcopy__(self, memo): + """Only copy the object when applying deepcopy.""" + other = type(self)(deepcopy(self.obj)) + memo[id(self)] = other + return other + + +class OutputSaveFunctionWrapper: + """A class that wraps a function and saves its outputs. + + This class can be used to decorate a function to save its outputs. It wraps + the function with a `__call__` method that calls the original function and + saves the results in a log attribute. + Args: + func (Callable): A function to wrap. + spec (Optional[Dict]): A dictionary of global variables to use as the + namespace for the wrapper. If `None`, the global namespace of the + original function is used. 
+ """ + + def __init__(self, func: Callable, spec: Optional[Dict]) -> None: + """Initializes the OutputSaveFunctionWrapper instance.""" + assert callable(func) + self.log = [] + self.func = func + self.func_name = func.__name__ + + if isinstance(spec, dict): + self.spec = spec + elif hasattr(func, '__globals__'): + self.spec = func.__globals__ + else: + raise ValueError + + def __call__(self, *args, **kwargs) -> Any: + """Calls the wrapped function with the given arguments and saves the + results in the `log` attribute.""" + results = self.func(*args, **kwargs) + self.log.append(results) + return results + + def __enter__(self) -> None: + """Enters the context and sets the wrapped function to be a global + variable in the specified namespace.""" + self.spec[self.func_name] = self + return self.log + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Exits the context and resets the wrapped function to its original + value in the specified namespace.""" + self.spec[self.func_name] = self.func diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/registry.py b/models/YOLO-World/third_party/mmyolo/mmyolo/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..71f43e6cf53d92917b7aea6175ae0540613ff720 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/registry.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMYOLO provides 17 registry nodes to support using modules across projects. +Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
+""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmyolo.engine']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmyolo.engine']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS, locations=['mmyolo.engine']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmyolo.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', 
parent=MMENGINE_DATASETS, locations=['mmyolo.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmyolo.datasets']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmyolo.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmyolo.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmyolo.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmyolo.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmyolo.engine.optimizers']) +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmyolo.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmyolo.engine']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmyolo.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', parent=MMENGINE_VISUALIZERS, locations=['mmyolo.utils']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', parent=MMENGINE_VISBACKENDS, locations=['mmyolo.utils']) diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/testing/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d7a010ee27b2822d44ad099f46f65bf6f0c00a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/testing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._utils import get_detector_cfg + +__all__ = ['get_detector_cfg'] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/testing/_utils.py b/models/YOLO-World/third_party/mmyolo/mmyolo/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccf2fe0cfd7baa3aeb7f3793c3db025d8889d5f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/testing/_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from os.path import dirname, exists, join + +import numpy as np +from mmengine.config import Config + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmyolo repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmyolo + repo_dpath = dirname(dirname(mmyolo.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _rand_bboxes(rng, num_boxes, w, h): + """Randomly generate a specified number of bboxes.""" + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/utils/__init__.py b/models/YOLO-World/third_party/mmyolo/mmyolo/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e968494892ccefb60d0c7b713c131ddc6fb869 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .misc import is_metainfo_lower, switch_to_deploy +from .setup_env import register_all_modules + +__all__ = [ + 'register_all_modules', 'collect_env', 'switch_to_deploy', + 'is_metainfo_lower' +] diff --git a/models/YOLO-World/third_party/mmyolo/mmyolo/utils/boxam_utils.py b/models/YOLO-World/third_party/mmyolo/mmyolo/utils/boxam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..50d6c09ecd309abe11777b4bc5307db0bbec2735 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/mmyolo/utils/boxam_utils.py @@ -0,0 +1,517 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import copy +import warnings +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torchvision +from mmcv.transforms import Compose +from mmdet.evaluation import get_classes +from mmdet.utils import ConfigType +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS + +try: + from pytorch_grad_cam import (AblationCAM, AblationLayer, + ActivationsAndGradients) + from pytorch_grad_cam import GradCAM as Base_GradCAM + from pytorch_grad_cam import GradCAMPlusPlus as Base_GradCAMPlusPlus + from pytorch_grad_cam.base_cam import BaseCAM + from pytorch_grad_cam.utils.image import scale_cam_image, show_cam_on_image + from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection +except ImportError: + pass + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'coco', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. 
+ + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to coco. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + # only change this + # grad based method requires train_cfg + # config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmyolo')) + + model = MODELS.build(config.model) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. 
def reshape_transform(feats: Union[Tensor, List[Tensor]],
                      max_shape: Tuple[int, int] = (20, 20),
                      is_need_grad: bool = False):
    """Reshape and aggregate feature maps when the input is a multi-layer
    feature map.

    Takes these tensors with different sizes, resizes them to a common shape,
    and concatenates them.

    Args:
        feats (Tensor | List[Tensor]): A single feature map or a list of
            feature maps. The last two dimensions are treated as height and
            width; bilinear interpolation expects 4-D input, presumably
            (N, C, H, W).
        max_shape (Tuple[int, int] | int): Upper bound (h, w) of the output
            spatial size. A single int or a one-element sequence is used for
            both sides. If it contains ``-1`` the largest input size is used
            without any cap. Defaults to (20, 20).
        is_need_grad (bool): Whether a grad-based method is in use. Such
            methods only support a single feature map. Defaults to False.

    Returns:
        Tensor: Absolute values of all feature maps, resized to the common
        shape and concatenated along dimension 1.

    Raises:
        NotImplementedError: If ``is_need_grad`` is True and ``feats`` is not
            a single tensor.
        ValueError: If ``feats`` is an empty sequence.
    """
    # Accept a bare int as well as a one-element sequence for square shapes.
    if isinstance(max_shape, int):
        max_shape = (max_shape, max_shape)
    elif len(max_shape) == 1:
        max_shape = tuple(max_shape) * 2

    if isinstance(feats, torch.Tensor):
        feats = [feats]
    elif is_need_grad:
        raise NotImplementedError('The `grad_base` method does not '
                                  'support output multi-activation layers')

    if len(feats) == 0:
        raise ValueError('`feats` must contain at least one feature map')

    max_h = max(feat.shape[-2] for feat in feats)
    max_w = max(feat.shape[-1] for feat in feats)
    if -1 in max_shape:
        target_shape = (max_h, max_w)
    else:
        # Never upsample beyond the largest input map.
        target_shape = (min(max_h, max_shape[0]), min(max_w, max_shape[1]))

    activations = [
        torch.nn.functional.interpolate(
            torch.abs(feat), target_shape, mode='bilinear') for feat in feats
    ]
    return torch.cat(activations, dim=1)
class BoxAMDetectorVisualizer:
    """Box AM visualization class.

    Builds a CAM object from ``method_class`` and draws the resulting
    activation map, together with the predicted boxes, on an image.

    Args:
        method_class: CAM class to instantiate. ``AblationCAM`` is selected
            by class name and receives extra ablation arguments.
        model (nn.Module): Wrapped detector; must expose ``device`` and
            ``detector.dataset_meta['classes']``.
        target_layers (List): Layers whose activations are visualized.
        reshape_transform (Callable, optional): Transform applied to the
            activations before the CAM is computed.
        is_need_grad (bool): Whether a grad-based method is used.
        extra_params (dict, optional): Extra options for ``AblationCAM``:
            ``batch_size``, ``ratio_channels_to_ablate`` and the required
            ``ablation_layer``.

    Raises:
        KeyError: If ``AblationCAM`` is requested without
            ``extra_params['ablation_layer']``.
    """

    def __init__(self,
                 method_class,
                 model: nn.Module,
                 target_layers: List,
                 reshape_transform: Optional[Callable] = None,
                 is_need_grad: bool = False,
                 extra_params: Optional[dict] = None):
        self.target_layers = target_layers
        self.reshape_transform = reshape_transform
        self.is_need_grad = is_need_grad

        use_cuda = 'cuda' in model.device
        if method_class.__name__ == 'AblationCAM':
            # ``extra_params`` defaults to None; fail with a clear message
            # instead of an AttributeError on ``None.get``.
            extra_params = extra_params or {}
            if 'ablation_layer' not in extra_params:
                raise KeyError(
                    "`extra_params['ablation_layer']` is required when "
                    'using AblationCAM')
            batch_size = extra_params.get('batch_size', 1)
            ratio_channels_to_ablate = extra_params.get(
                'ratio_channels_to_ablate', 1.)
            self.cam = AblationCAM(
                model,
                target_layers,
                use_cuda=use_cuda,
                reshape_transform=reshape_transform,
                batch_size=batch_size,
                ablation_layer=extra_params['ablation_layer'],
                ratio_channels_to_ablate=ratio_channels_to_ablate)
        else:
            self.cam = method_class(
                model,
                target_layers,
                use_cuda=use_cuda,
                reshape_transform=reshape_transform,
            )
            # NOTE(review): placement inside this branch is reconstructed
            # from a whitespace-collapsed source - confirm against upstream.
            if self.is_need_grad:
                self.cam.activations_and_grads.release()

        self.classes = model.detector.dataset_meta['classes']
        # One random color per class, used when drawing boxes and labels.
        self.COLORS = np.random.uniform(0, 255, size=(len(self.classes), 3))

    def switch_activations_and_grads(self, model) -> None:
        """In the grad-based method, we need to switch
        ``ActivationsAndGradients`` layer, otherwise an error will occur."""
        self.cam.model = model

        if self.is_need_grad is True:
            self.cam.activations_and_grads = ActivationsAndGradients(
                model, self.target_layers, self.reshape_transform)
            self.is_need_grad = False
        else:
            self.cam.activations_and_grads.release()
            self.is_need_grad = True

    def __call__(self, img, targets, aug_smooth=False, eigen_smooth=False):
        """Run the CAM on one image and return the map of the first item.

        ``img`` is a numpy array that is given a leading batch dimension and
        permuted from channel-last to channel-first before the CAM call.
        """
        img = torch.from_numpy(img)[None].permute(0, 3, 1, 2)
        return self.cam(img, targets, aug_smooth, eigen_smooth)[0, :]

    def show_am(self,
                image: np.ndarray,
                pred_instance: 'InstanceData',
                grayscale_am: np.ndarray,
                with_norm_in_bboxes: bool = False):
        """Normalize the AM to be in the range [0, 1] inside every bounding
        boxes, and zero outside of the bounding boxes.

        Args:
            image (np.ndarray): Image to overlay; divided by 255 before the
                overlay is computed.
            pred_instance (InstanceData): Predictions providing ``bboxes``,
                ``labels`` and optionally ``scores``.
            grayscale_am (np.ndarray): Activation map to display.
            with_norm_in_bboxes (bool): Whether to renormalize the map
                inside each box and zero it elsewhere. Defaults to False.

        Returns:
            np.ndarray: The overlay image with boxes and labels drawn.
        """
        boxes = pred_instance.bboxes
        labels = pred_instance.labels

        if with_norm_in_bboxes is True:
            boxes = boxes.astype(np.int32)
            renormalized_am = np.zeros(grayscale_am.shape, dtype=np.float32)
            images = []
            for x1, y1, x2, y2 in boxes:
                img = np.zeros_like(renormalized_am)
                img[y1:y2, x1:x2] = scale_cam_image(
                    [grayscale_am[y1:y2, x1:x2].copy()])[0]
                images.append(img)

            # With no boxes there is nothing to reduce over; keep the
            # all-zero map instead of calling np.max on an empty array.
            if images:
                renormalized_am = np.max(np.float32(images), axis=0)
            renormalized_am = scale_cam_image([renormalized_am])[0]
        else:
            renormalized_am = grayscale_am

        am_image_renormalized = show_cam_on_image(
            image / 255, renormalized_am, use_rgb=False)

        image_with_bounding_boxes = self._draw_boxes(
            boxes, labels, am_image_renormalized, pred_instance.get('scores'))
        return image_with_bounding_boxes

    def _draw_boxes(self,
                    boxes: List,
                    labels: List,
                    image: np.ndarray,
                    scores: Optional[List] = None):
        """Draw boxes and their class labels on ``image`` and return it.

        When ``scores`` is given, the label text also carries the score as a
        percentage rounded to one decimal.
        """
        for i, box in enumerate(boxes):
            label = labels[i]
            color = self.COLORS[label]
            cv2.rectangle(image, (int(box[0]), int(box[1])),
                          (int(box[2]), int(box[3])), color, 2)
            if scores is not None:
                score = scores[i]
                text = str(self.classes[label]) + ': ' + str(
                    round(score * 100, 1))
            else:
                text = self.classes[label]

            cv2.putText(
                image,
                text, (int(box[0]), int(box[1] - 5)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                color,
                1,
                lineType=cv2.LINE_AA)
        return image
class DetBoxScoreTarget:
    """Det Score calculation class.

    In the case of the grad-free method, the calculation method is that
    for every original detected bounding box specified in "bboxes",
    assign a score on how the current bounding boxes match it,

        1. In Bbox IoU
        2. In the classification score.
        3. In Mask IoU if ``segms`` exist.

    If there is not a large enough overlap, or the category changed,
    assign a score of 0. The total score is the sum of all the box scores.

    In the case of the grad-based method, the calculation method is
    the sum of losses after excluding a specific key.

    Args:
        pred_instance (InstanceData): Reference predictions; its ``bboxes``
            and ``labels`` are the focal boxes and labels.
        match_iou_thr (float): IoU above which a prediction matches a focal
            box. Defaults to 0.5.
        device (str): Device of the returned score tensor.
            Defaults to 'cuda:0'.
        ignore_loss_params (List, optional): Loss keys excluded from the
            grad-based sum. ``None`` excludes nothing.
    """

    def __init__(self,
                 pred_instance: 'InstanceData',
                 match_iou_thr: float = 0.5,
                 device: str = 'cuda:0',
                 ignore_loss_params: Optional[List] = None):
        self.focal_bboxes = pred_instance.bboxes
        self.focal_labels = pred_instance.labels
        self.match_iou_thr = match_iou_thr
        self.device = device
        self.ignore_loss_params = ignore_loss_params
        if ignore_loss_params is not None:
            assert isinstance(self.ignore_loss_params, list)

    def __call__(self, results):
        """Compute the scalar target from detector output.

        Args:
            results: Either a dict of losses (grad-based method, detected by
                the presence of ``'loss_cls'``) or an object exposing
                ``pred_instances`` (grad-free method).

        Returns:
            Tensor: One-element tensor holding the summed losses or the
            summed matching scores.
        """
        output = torch.tensor([0.], device=self.device)

        if 'loss_cls' in results:
            # grad-based method
            # results is dict
            # The default ``ignore_loss_params=None`` means "ignore nothing";
            # a membership test against None would raise TypeError.
            ignored_keys = self.ignore_loss_params or ()
            for loss_key, loss_value in results.items():
                if 'loss' not in loss_key or loss_key in ignored_keys:
                    continue
                if isinstance(loss_value, list):
                    output += sum(loss_value)
                else:
                    output += loss_value
            return output

        # grad-free method
        # results is DetDataSample
        pred_instances = results.pred_instances
        if len(pred_instances) == 0:
            return output

        pred_bboxes = pred_instances.bboxes
        pred_scores = pred_instances.scores
        pred_labels = pred_instances.labels

        for focal_box, focal_label in zip(self.focal_bboxes,
                                          self.focal_labels):
            ious = torchvision.ops.box_iou(focal_box[None],
                                           pred_bboxes[..., :4])
            index = ious.argmax()
            if ious[0, index] > self.match_iou_thr and pred_labels[
                    index] == focal_label:
                # TODO: Adaptive adjustment of weights based on algorithms
                score = ious[0, index] + pred_scores[index]
                output = output + score
        return output
class LabelmeFormat:
    """Predict results save into labelme file.

    Base on https://github.com/wkentaro/labelme/blob/main/labelme/label_file.py

    Args:
        classes (tuple): Model classes name.
    """

    def __init__(self, classes: tuple):
        super().__init__()
        self.classes = classes

    def __call__(self,
                 pred_instances: 'InstanceData',
                 metainfo: dict,
                 output_path: str,
                 selected_classes: list = None):
        """Write the predictions of one image to a labelme JSON file.

        Args:
            pred_instances (InstanceData): Candidate prediction info.
                Iterating it must yield one prediction at a time, each with
                ``bboxes`` (first row is the box) and ``labels``.
            metainfo (dict): Meta info of prediction. ``img_path`` and
                ``ori_shape`` (height first, then width) are read.
            output_path (str): Path of the labelme JSON file to write.
            selected_classes (list, optional): Class names to keep. ``None``
                keeps every prediction. Defaults to None.

        Labelme file eg.
            {
              "version": "5.1.1",
              "flags": {},
              "imagePath": "/data/cat/1.jpg",
              "imageData": null,
              "imageHeight": 3000,
              "imageWidth": 4000,
              "shapes": [
                {
                  "label": "cat",
                  "points": [
                    [
                      1148.076923076923,
                      1188.4615384615383
                    ],
                    [
                      2471.1538461538457,
                      2176.923076923077
                    ]
                  ],
                  "group_id": null,
                  "shape_type": "rectangle",
                  "flags": {}
                },
                {...}
              ]
            }
        """

        image_path = os.path.abspath(metainfo['img_path'])

        json_info = {
            'version': '5.1.1',
            'flags': {},
            'imagePath': image_path,
            'imageData': None,
            'imageHeight': metainfo['ori_shape'][0],
            'imageWidth': metainfo['ori_shape'][1],
            'shapes': []
        }

        for pred_instance in pred_instances:
            pred_bbox = pred_instance.bboxes.cpu().numpy().tolist()[0]
            # ``labels`` holds a single element here; convert it explicitly
            # rather than relying on implicit index conversion.
            pred_label = self.classes[int(pred_instance.labels)]

            if selected_classes is not None and \
                    pred_label not in selected_classes:
                # filter class name
                continue

            sub_dict = {
                'label': pred_label,
                # Two corner points: (x1, y1) and (x2, y2).
                'points': [pred_bbox[:2], pred_bbox[2:]],
                'group_id': None,
                'shape_type': 'rectangle',
                'flags': {}
            }
            json_info['shapes'].append(sub_dict)

        with open(output_path, 'w', encoding='utf-8') as f_json:
            json.dump(json_info, f_json, ensure_ascii=False, indent=2)
def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]):
    """Shift rotated bboxes with offset.

    Args:
        bboxes (Tensor): The rotated bboxes need to be translated.
            With shape (n, 5), which means (x, y, w, h, a).
        offset (Sequence[int]): The translation offsets with shape of (2, ).

    Returns:
        Tensor: Shifted rotated bboxes. The input tensor is not modified.
    """
    offset_tensor = bboxes.new_tensor(offset)
    shifted_bboxes = bboxes.clone()
    # Only the center (x, y) moves; size and angle are unchanged.
    shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor
    return shifted_bboxes


def shift_predictions(det_data_samples: 'SampleList',
                      offsets: Sequence[Tuple[int, int]],
                      src_image_shape: Tuple[int, int]) -> 'InstanceData':
    """Shift predictions to the original image.

    Args:
        det_data_samples (List[:obj:`DetDataSample`]): A list of patch results.
        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
            of patches.
        src_image_shape (Tuple[int, int]): A (height, width) tuple of the
            large image.

    Returns:
        :obj:`InstanceData`: The shifted predictions of all patches,
        concatenated into a single instance container.

    Raises:
        ImportError: If ``sahi`` is not installed.
        NotImplementedError: If the last bbox dimension is neither 4
            (horizontal) nor 5 (rotated).
    """
    try:
        from sahi.slicing import shift_bboxes, shift_masks
    except ImportError:
        raise ImportError('Please run "pip install -U sahi" '
                          'to install sahi first for large image inference.')

    assert len(det_data_samples) == len(
        offsets), 'The `results` should has the ' 'same length with `offsets`.'
    shifted_predictions = []
    for det_data_sample, offset in zip(det_data_samples, offsets):
        # Work on a copy so the caller's patch results stay untouched.
        pred_inst = det_data_sample.pred_instances.clone()

        # Check bbox type
        if pred_inst.bboxes.size(-1) == 4:
            # Horizontal bboxes
            shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset)
        elif pred_inst.bboxes.size(-1) == 5:
            # Rotated bboxes
            shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset)
        else:
            raise NotImplementedError

        # shift bboxes and masks
        pred_inst.bboxes = shifted_bboxes
        # Masks are a field of the predicted instances, not of the data
        # sample, so the membership test must look at ``pred_inst``.
        if 'masks' in pred_inst:
            pred_inst.masks = shift_masks(pred_inst.masks, offset,
                                          src_image_shape)

        shifted_predictions.append(pred_inst)

    shifted_predictions = InstanceData.cat(shifted_predictions)

    return shifted_predictions
def auto_arrange_images(image_list: list, image_column: int = 2) -> np.ndarray:
    """Auto arrange image to image_column x N row.

    Images are laid out row by row. When the last row is incomplete it is
    padded with white images of the same shape as the first image. The
    caller's ``image_list`` is not modified.

    Args:
        image_list (list): cv2 image list. All images are expected to share
            one shape so they can be stacked.
        image_column (int): Arrange to N column. Default: 2.

    Return:
        (np.ndarray): image_column x N row merge image
    """
    img_count = len(image_list)
    if img_count <= image_column:
        # no need to arrange
        return np.concatenate(image_list, axis=1)

    # Ceiling division: every image needs a slot. ``round`` would
    # under-count (e.g. 5 images in 2 columns -> 2 rows) and drop images.
    image_row = -(-img_count // image_column)
    fill_count = image_row * image_column - img_count
    blank_img = np.ones(image_list[0].shape, dtype=np.uint8) * 255
    # Build a padded copy instead of extending the caller's list in place.
    padded_list = list(image_list) + [blank_img] * fill_count

    merge_imgs_col = [
        np.hstack(padded_list[image_column * i:image_column * (i + 1)])
        for i in range(image_row)
    ]

    # merge to one image
    return np.vstack(merge_imgs_col)
def show_data_classes(data_classes):
    """Print all class names of the dataset as a table.

    Intended for error reporting. The names are laid out in columns of at
    most 25 rows; the last column is padded with empty strings so that all
    columns have equal length.

    Args:
        data_classes (Sequence[str]): Class names of the dataset.
    """
    print('\n\nThe name of the class contained in the dataset:')
    data_classes_info = PrettyTable()
    data_classes_info.title = 'Information of dataset class'
    # List Print Settings
    # If the quantity is too large, 25 rows will be displayed in each column
    rows_per_column = 25
    data_name_list = list(data_classes)
    if len(data_name_list) <= rows_per_column:
        data_classes_info.add_column('Class name', data_name_list)
    else:
        # Pad up to a whole number of columns. Exact multiples of 25 need no
        # padding but must still be printed, so no branch may skip them.
        remainder = len(data_name_list) % rows_per_column
        if remainder:
            data_name_list.extend([''] * (rows_per_column - remainder))
        for i in range(0, len(data_name_list), rows_per_column):
            data_classes_info.add_column(
                'Class name', data_name_list[i:i + rows_per_column])

    # Align display data to the left
    data_classes_info.align['Class name'] = 'l'
    print(data_classes_info)
def register_all_modules(init_default_scope: bool = True):
    """Register all modules in mmyolo into the registries.

    Importing ``mmdet.engine``, ``mmdet.visualization`` and the mmyolo
    ``datasets``, ``engine`` and ``models`` packages triggers their
    registration.

    Args:
        init_default_scope (bool): Whether initialize the mmyolo default scope.
            When `init_default_scope=True`, the global default scope will be
            set to `mmyolo`. If another scope is currently active, a warning
            is emitted and a new `mmyolo` scope instance is created. To
            understand more about the registry, please refer
            to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md
            Defaults to True.
    """  # noqa
    import mmdet.engine  # noqa: F401,F403
    import mmdet.visualization  # noqa: F401,F403

    import mmyolo.datasets  # noqa: F401,F403
    import mmyolo.engine  # noqa: F401,F403
    import mmyolo.models  # noqa: F401,F403

    if init_default_scope:
        never_created = DefaultScope.get_current_instance() is None \
            or not DefaultScope.check_instance_created('mmyolo')
        if never_created:
            DefaultScope.get_instance('mmyolo', scope_name='mmyolo')
            return
        current_scope = DefaultScope.get_current_instance()
        if current_scope.scope_name != 'mmyolo':
            # The adjacent literals are concatenated; each piece ends with a
            # space so the words do not run together.
            warnings.warn('The current default scope '
                          f'"{current_scope.scope_name}" is not "mmyolo", '
                          '`register_all_modules` will force the current '
                          'default scope to be "mmyolo". If this is not '
                          'expected, please set `init_default_scope=False`.')
            # avoid name conflict
            new_instance_name = f'mmyolo-{datetime.datetime.now()}'
            DefaultScope.get_instance(new_instance_name, scope_name='mmyolo')
__version__ = '0.6.0'

from typing import Tuple

short_version = __version__


def parse_version_info(version_str: str) -> Tuple:
    """Parse version info of MMYOLO.

    The string is split on dots. Purely numeric components become ints; a
    component containing ``rc`` contributes the int before ``rc`` followed by
    the string ``rc<suffix>``. Any other component is skipped.

    Args:
        version_str (str): Version string such as ``'0.6.0'`` or
            ``'1.0.0rc1'``.

    Returns:
        Tuple: The parsed components, e.g. ``(1, 0, 0, 'rc1')``.
    """
    components = []
    for piece in version_str.split('.'):
        if piece.isdigit():
            components.append(int(piece))
        elif 'rc' in piece:
            fields = piece.split('rc')
            components.extend((int(fields[0]), f'rc{fields[1]}'))
    return tuple(components)


version_info = parse_version_info(__version__)
+ +## Usage + +### Command + +YOLOv5 assigner visualization command: + +```shell +python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py +``` + +Note: `YOLOv5` does not need to load the trained weights. + +YOLOv7 assigner visualization command: + +```shell +python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py -c ${checkpoint} +``` + +YOLOv8 assigner visualization command: + +```shell +python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py -c ${checkpoint} +``` + +RTMDet assigner visualization command: + +```shell +python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py -c ${checkpoint} +``` + +`${checkpoint}` is the checkpoint file path. Dynamic label assignment is used in `YOLOv7`, `YOLOv8` and `RTMDet`, so the model weights affect the positive sample allocation results; it is therefore recommended to load the trained model weights. + +If you want to know details about label assignment, you can check the [RTMDet](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/rtmdet_description.html#id5). diff --git a/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..e290d26b6d6fbb2f703faf3ebcd0474da871aea8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
def parse_args():
    """Parse the command-line arguments of the assigner visualization script.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes ``config``,
        ``checkpoint``, ``show_number``, ``output_dir``, ``device``,
        ``show_prior``, ``not_show_label``, ``seed`` and ``cfg_options``.
    """
    parser = argparse.ArgumentParser(
        description='MMYOLO show the positive sample assigning'
        ' results.')
    parser.add_argument('config', help='config file path')
    parser.add_argument('--checkpoint', '-c', type=str, help='checkpoint file')
    parser.add_argument(
        '--show-number',
        '-n',
        type=int,
        default=sys.maxsize,
        help='number of images selected to save, '
        'must be bigger than 0. if the number is bigger than length '
        'of dataset, show all the images in dataset; '
        'default "sys.maxsize", show all images in dataset')
    parser.add_argument(
        '--output-dir',
        default='assigned_results',
        type=str,
        help='The name of the folder where the image is saved.')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference.')
    parser.add_argument(
        '--show-prior',
        default=False,
        action='store_true',
        help='Whether to show prior on image.')
    # The flag is a negation: passing it sets ``not_show_label`` to True, so
    # the help text must describe hiding, not showing.
    parser.add_argument(
        '--not-show-label',
        default=False,
        action='store_true',
        help='Whether to hide label on image.')
    parser.add_argument('--seed', default=-1, type=int, help='random seed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')

    args = parser.parse_args()
    return args
def main():
    """Visualize the positive-sample assignment of a detector on a dataset.

    Builds the model and the training dataset from the config, runs
    ``model.assign`` on up to ``--show-number`` samples and writes one
    rendered image per sample into ``--output-dir``.
    """
    args = parse_args()
    register_all_modules()

    # set random seed (``random`` here is ``numpy.random``)
    seed = int(args.seed)
    if seed != -1:
        print(f'Set the global seed: {seed}')
        random.seed(seed)

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # build model
    model = MODELS.build(cfg.model)
    if args.checkpoint is not None:
        load_checkpoint(model, args.checkpoint)
    elif isinstance(
            model.bbox_head,
        (YOLOv7HeadAssigner, YOLOv8HeadAssigner, RTMHeadAssigner)):
        # All three heads assign labels from the predictions, so untrained
        # weights give meaningless assignments. YOLOv8 was previously missing
        # from this check although the message names it.
        warnings.warn(
            'if you use dynamic_assignment methods such as YOLOv7 or '
            'YOLOv8 or RTMDet assigner, please load the checkpoint.')
    assert isinstance(model.bbox_head, (YOLOv5HeadAssigner,
                                        YOLOv7HeadAssigner,
                                        YOLOv8HeadAssigner,
                                        RTMHeadAssigner)), \
        'Now, this script only support YOLOv5, YOLOv7, YOLOv8 and RTMDet, ' \
        'and bbox_head must use ' \
        '`YOLOv5HeadAssigner or YOLOv7HeadAssigner or YOLOv8HeadAssigner ' \
        'or RTMHeadAssigner`. Please use `' \
        'yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py ' \
        'or yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py ' \
        'or yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py ' \
        'or rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py' \
        '` as config file.'
    model.eval()
    model.to(args.device)

    # build dataset
    dataset_cfg = cfg.get('train_dataloader').get('dataset')
    dataset = DATASETS.build(dataset_cfg)

    # get collate_fn
    collate_fn_cfg = cfg.get('train_dataloader').pop(
        'collate_fn', dict(type='pseudo_collate'))
    collate_fn_type = collate_fn_cfg.pop('type')
    collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type)

    # init visualizer
    visualizer = YOLOAssignerVisualizer(
        vis_backends=[{
            'type': 'LocalVisBackend'
        }], name='visualizer')
    visualizer.dataset_meta = dataset.metainfo
    # need priors size to draw priors

    if hasattr(model.bbox_head.prior_generator, 'base_anchors'):
        visualizer.priors_size = model.bbox_head.prior_generator.base_anchors

    # make output dir
    os.makedirs(args.output_dir, exist_ok=True)
    print('Results will save to ', args.output_dir)

    # init visualization image number
    assert args.show_number > 0
    display_number = min(args.show_number, len(dataset))

    progress_bar = ProgressBar(display_number)
    for ind_img in range(display_number):
        data = dataset.prepare_data(ind_img)
        if data is None:
            print('Unable to visualize {} due to strong data augmentations'.
                  format(dataset[ind_img]['data_samples'].img_path))
            # Count the skipped sample so the bar still reaches 100%.
            progress_bar.update()
            continue
        # convert data to batch format
        batch_data = collate_fn([data])
        with torch.no_grad():
            assign_results = model.assign(batch_data)

        img = data['inputs'].cpu().numpy().astype(np.uint8).transpose(
            (1, 2, 0))
        # bgr2rgb
        img = mmcv.bgr2rgb(img)

        gt_instances = data['data_samples'].gt_instances

        img_show = visualizer.draw_assign(img, assign_results, gt_instances,
                                          args.show_prior, args.not_show_label)

        if hasattr(data['data_samples'], 'img_path'):
            filename = osp.basename(data['data_samples'].img_path)
        else:
            # some dataset have not image path
            filename = f'{ind_img}.jpg'
        out_file = osp.join(args.output_dir, filename)

        # convert rgb 2 bgr and save img
        mmcv.imwrite(mmcv.rgb2bgr(img_show), out_file)
        progress_bar.update()


if __name__ == '__main__':
    main()
b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py new file mode 100644 index 0000000000000000000000000000000000000000..1db799b5142375c86bd5a018764017c9d3170a07 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py @@ -0,0 +1,11 @@ +_base_ = [ + '../../../configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv5HeadAssigner')) diff --git a/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py new file mode 100644 index 0000000000000000000000000000000000000000..626dc18b59df3b9ced0781347989b65f64de5042 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py @@ -0,0 +1,9 @@ +_base_ = ['../../../configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py'] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv7HeadAssigner')) diff --git a/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py new file mode 100644 index 
0000000000000000000000000000000000000000..03dcae8c39a09c0200dc52123efc1bc0a348dea3 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py @@ -0,0 +1,9 @@ +_base_ = ['../../../configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py'] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv8HeadAssigner')) diff --git a/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82adaaba8ebe3510895ebc3d5ed5ac7c573b41b2 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .rtmdet_head_assigner import RTMHeadAssigner +from .yolov5_head_assigner import YOLOv5HeadAssigner +from .yolov7_head_assigner import YOLOv7HeadAssigner +from .yolov8_head_assigner import YOLOv8HeadAssigner + +__all__ = [ + 'YOLOv5HeadAssigner', 'YOLOv7HeadAssigner', 'YOLOv8HeadAssigner', + 'RTMHeadAssigner' +] diff --git a/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ae1c86d054d02a7a8537ee91251c0cca39edc6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@MODELS.register_module()
class RTMHeadAssigner(RTMDetHead):
    """RTMDet head that reports which priors are assigned to which gt box.

    Used by ``assigner_visualization.py``; it reuses the head's own
    ``assigner`` and repackages the result per feature level.
    """

    def assign_by_gt_and_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        inputs_hw: Union[Tensor, tuple] = (640, 640)
    ) -> List[List[dict]]:
        """Calculate the assigning results based on the gt and features
        extracted by the detection head.

        Args:
            cls_scores (list[Tensor]): Box scores for each scale level
                Has shape (N, num_anchors * num_classes, H, W)
            bbox_preds (list[Tensor]): Decoded box for each scale
                level with shape (N, num_anchors * 4, H, W) in
                [tl_x, tl_y, br_x, br_y] format.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            inputs_hw (Union[Tensor, tuple]): Height and width of inputs size.
                Note that the passed value is not used: it is overwritten
                below with ``batch_img_metas[0]['batch_input_shape']``.
        Returns:
            list[list[dict]]: One single-element list per feature level. Each
            dict holds ``stride``, ``grid_x_inds``, ``grid_y_inds``,
            ``img_inds``, ``class_inds``, ``retained_gt_inds``, ``prior_ind``
            (always 0) and ``offset`` (the prior generator's offset).
        """
        num_imgs = len(batch_img_metas)
        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
        assert len(featmap_sizes) == self.prior_generator.num_levels
        # rtmdet's prior offset differs from others
        prior_offset = self.prior_generator.offset

        # Column 0 of gt_info is used as the label, the rest as the box.
        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        # Rows whose box coordinates sum to <= 0 are flagged as padding.
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        device = cls_scores[0].device

        # If the shape does not equal, generate new one
        if featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = featmap_sizes
            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                featmap_sizes, device=device, with_stride=True)
            self.flatten_priors_train = torch.cat(
                mlvl_priors_with_stride, dim=0)

        # Flatten every level to (num_imgs, H*W, C) and concatenate levels.
        flatten_cls_scores = torch.cat([
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.cls_out_channels)
            for cls_score in cls_scores
        ], 1).contiguous()

        flatten_bboxes = torch.cat([
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ], 1)
        # Scale predicted distances by the last prior column (presumably the
        # stride, since priors were built with ``with_stride=True``), then
        # decode them around the prior centers.
        flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1,
                                                                    None]
        flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2],
                                       flatten_bboxes)

        assigned_result = self.assigner(flatten_bboxes.detach(),
                                        flatten_cls_scores.detach(),
                                        self.flatten_priors_train, gt_labels,
                                        gt_bboxes, pad_bbox_flag)

        labels = assigned_result['assigned_labels'].reshape(-1)
        bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4)

        # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
        bg_class_ind = self.num_classes
        pos_inds = ((labels >= 0)
                    & (labels < bg_class_ind)).nonzero().squeeze(1)
        targets = bbox_targets[pos_inds]
        # NOTE(review): squeeze(0) only drops the batch dim when it is 1, so
        # the matching below appears to assume a single image - confirm.
        gt_bboxes = gt_bboxes.squeeze(0)
        # Recover the gt index of each positive by finding the first gt row
        # equal to the assigned box in all coordinates.
        matched_gt_inds = torch.tensor(
            [((t == gt_bboxes).sum(dim=1) == t.shape[0]).nonzero()[0]
             for t in targets],
            device=device)

        level_inds = torch.zeros_like(labels)
        # Image index is all zeros (see the single-image note above).
        img_inds = torch.zeros_like(labels)
        # Turn per-level prior counts into cumulative offsets in place and
        # tag every flattened prior with its level index.
        level_nums = [0] + [f[0] * f[1] for f in featmap_sizes]
        for i in range(len(level_nums) - 1):
            level_nums[i + 1] = level_nums[i] + level_nums[i + 1]
            level_inds[level_nums[i]:level_nums[i + 1]] = i
        level_inds_pos = level_inds[pos_inds]

        img_inds = img_inds[pos_inds]
        labels = labels[pos_inds]

        # Overrides the ``inputs_hw`` argument with the batch input shape.
        inputs_hw = batch_img_metas[0]['batch_input_shape']
        assign_results = []
        for i in range(self.num_levels):
            retained_inds = level_inds_pos == i
            if not retained_inds.any():
                # No positive on this level: emit empty index tensors.
                assign_results_prior = {
                    'stride':
                    self.featmap_strides[i],
                    'grid_x_inds':
                    torch.zeros([0], dtype=torch.int64).to(device),
                    'grid_y_inds':
                    torch.zeros([0], dtype=torch.int64).to(device),
                    'img_inds':
                    torch.zeros([0], dtype=torch.int64).to(device),
                    'class_inds':
                    torch.zeros([0], dtype=torch.int64).to(device),
                    'retained_gt_inds':
                    torch.zeros([0], dtype=torch.int64).to(device),
                    'prior_ind':
                    0,
                    'offset':
                    prior_offset
                }
            else:
                # Feature-map width of this level, derived from input width.
                w = inputs_hw[1] // self.featmap_strides[i]

                # Convert the flat, level-local prior index to (row, col).
                retained_pos_inds = pos_inds[retained_inds] - level_nums[i]
                grid_y_inds = retained_pos_inds // w
                grid_x_inds = retained_pos_inds - retained_pos_inds // w * w
                assign_results_prior = {
                    'stride': self.featmap_strides[i],
                    'grid_x_inds': grid_x_inds,
                    'grid_y_inds': grid_y_inds,
                    'img_inds': img_inds[retained_inds],
                    'class_inds': labels[retained_inds],
                    'retained_gt_inds': matched_gt_inds[retained_inds],
                    'prior_ind': 0,
                    'offset': prior_offset
                }
            assign_results.append([assign_results_prior])
        return assign_results

    def assign(self, batch_data_samples: Union[list, dict],
               inputs_hw: Union[tuple, torch.Size]) -> List[List[dict]]:
        """Calculate assigning results. This function is provided to the
        `assigner_visualization.py` script.

        Args:
            batch_data_samples (List[:obj:`DetDataSample`], dict): The Data
                Samples. Only the dict form is supported; the keys ``feats``,
                ``bboxes_labels`` and ``img_metas`` are read from it.
            inputs_hw: Height and width of inputs size

        Returns:
            list[list[dict]]: Assigning results, see
            :meth:`assign_by_gt_and_feat`.

        Raises:
            NotImplementedError: If ``batch_data_samples`` is a list.
        """
        if isinstance(batch_data_samples, list):
            raise NotImplementedError(
                'assigning results_list is not implemented')
        else:
            # Fast version
            cls_scores, bbox_preds = self(batch_data_samples['feats'])
            assign_inputs = (cls_scores, bbox_preds,
                             batch_data_samples['bboxes_labels'],
                             batch_data_samples['img_metas'], inputs_hw)
        assign_results = self.assign_by_gt_and_feat(*assign_inputs)
        return assign_results
@MODELS.register_module()
class YOLOv5HeadAssigner(YOLOv5Head):
    """YOLOv5 head that reports which priors are assigned to which gt box.

    Used by ``assigner_visualization.py``. The assignment is purely
    shape/position based, so no predictions are needed.
    """

    @staticmethod
    def _empty_prior_result(stride, prior_ind: int, device) -> dict:
        """Build the result dict of a prior that has no positive sample."""

        def _empty():
            return torch.zeros([0], dtype=torch.int64).to(device)

        return {
            'stride': stride,
            'grid_x_inds': _empty(),
            'grid_y_inds': _empty(),
            'img_inds': _empty(),
            'class_inds': _empty(),
            'retained_gt_inds': _empty(),
            'prior_ind': prior_ind
        }

    def assign_by_gt_and_feat(
        self,
        batch_gt_instances: Sequence[InstanceData],
        batch_img_metas: Sequence[dict],
        inputs_hw: Union[Tensor, tuple] = (640, 640)
    ) -> list:
        """Calculate the assigning results based on the gt and features
        extracted by the detection head.

        Args:
            batch_gt_instances (Sequence[InstanceData]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (Sequence[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            inputs_hw (Union[Tensor, tuple]): Height and width of inputs size.
        Returns:
            list[list[dict]]: For every feature level, a list with one dict
            per base prior. Each dict holds ``stride``, ``grid_x_inds``,
            ``grid_y_inds``, ``img_inds``, ``class_inds``,
            ``retained_gt_inds`` and ``prior_ind``.
        """
        # 1. Convert gt to norm format
        batch_targets_normed = self._convert_gt_to_norm_format(
            batch_gt_instances, batch_img_metas)

        device = batch_targets_normed.device
        scaled_factor = torch.ones(7, device=device)
        # gt index of every target, replicated once per base prior.
        gt_inds = torch.arange(
            batch_targets_normed.shape[1],
            dtype=torch.long,
            device=device,
            requires_grad=False).unsqueeze(0).repeat((self.num_base_priors, 1))

        assign_results = []
        for i in range(self.num_levels):
            stride = self.featmap_strides[i]
            h = inputs_hw[0] // stride
            w = inputs_hw[1] // stride

            # empty gt bboxes
            if batch_targets_normed.shape[1] == 0:
                assign_results.append([
                    self._empty_prior_result(stride, k, device)
                    for k in range(self.num_base_priors)
                ])
                continue

            priors_base_sizes_i = self.priors_base_sizes[i]
            # feature map scale whwh
            scaled_factor[2:6] = torch.tensor([w, h, w, h])
            # Scale batch_targets from range 0-1 to range 0-features_maps size.
            # (num_base_priors, num_bboxes, 7)
            batch_targets_scaled = batch_targets_normed * scaled_factor

            # 2. Shape match
            wh_ratio = batch_targets_scaled[...,
                                            4:6] / priors_base_sizes_i[:, None]
            match_inds = torch.max(
                wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr
            batch_targets_scaled = batch_targets_scaled[match_inds]
            match_gt_inds = gt_inds[match_inds]

            # no gt bbox matches anchor
            if batch_targets_scaled.shape[0] == 0:
                assign_results.append([
                    self._empty_prior_result(stride, k, device)
                    for k in range(self.num_base_priors)
                ])
                continue

            # 3. Positive samples with additional neighbors

            # check the left, up, right, bottom sides of the
            # targets grid, and determine whether assigned
            # them as positive samples as well.
            batch_targets_cxcy = batch_targets_scaled[:, 2:4]
            grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy
            left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) &
                        (batch_targets_cxcy > 1)).T
            right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) &
                             (grid_xy > 1)).T
            # Row 0 (all True) keeps the cell containing the center itself.
            offset_inds = torch.stack(
                (torch.ones_like(left), left, up, right, bottom))

            batch_targets_scaled = batch_targets_scaled.repeat(
                (5, 1, 1))[offset_inds]
            retained_gt_inds = match_gt_inds.repeat((5, 1))[offset_inds]
            retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1],
                                                       1)[offset_inds]

            # Split the 7 target columns into (img, class), (x, y), (w, h)
            # and the prior index; the (w, h) chunk is not needed here.
            img_class_inds, grid_xy, _, priors_inds = \
                batch_targets_scaled.chunk(4, 1)
            priors_inds = priors_inds.long().view(-1)
            img_inds, class_inds = img_class_inds.long().T

            grid_xy_long = (grid_xy -
                            retained_offsets * self.near_neighbor_thr).long()
            grid_x_inds, grid_y_inds = grid_xy_long.T

            assign_results_feat = []
            for k in range(self.num_base_priors):
                retained_inds = priors_inds == k
                assign_results_feat.append({
                    'stride': stride,
                    'grid_x_inds': grid_x_inds[retained_inds],
                    'grid_y_inds': grid_y_inds[retained_inds],
                    'img_inds': img_inds[retained_inds],
                    'class_inds': class_inds[retained_inds],
                    'retained_gt_inds': retained_gt_inds[retained_inds],
                    'prior_ind': k
                })
            assign_results.append(assign_results_feat)
        return assign_results

    def assign(self, batch_data_samples: Union[list, dict],
               inputs_hw: Union[tuple, torch.Size]) -> list:
        """Calculate assigning results. This function is provided to the
        `assigner_visualization.py` script.

        Args:
            batch_data_samples (List[:obj:`DetDataSample`], dict): The Data
                Samples. A list is unpacked with ``unpack_gt_instances``;
                from a dict the keys ``bboxes_labels`` and ``img_metas``
                are read.
            inputs_hw: Height and width of inputs size

        Returns:
            list[list[dict]]: Assigning results, see
            :meth:`assign_by_gt_and_feat`.
        """
        if isinstance(batch_data_samples, list):
            # ``assign_by_gt_and_feat`` takes no ignore-instances argument;
            # forwarding it (as before) raised a TypeError.
            batch_gt_instances, _, batch_img_metas = unpack_gt_instances(
                batch_data_samples)
            assign_inputs = (batch_gt_instances, batch_img_metas, inputs_hw)
        else:
            # Fast version
            assign_inputs = (batch_data_samples['bboxes_labels'],
                             batch_data_samples['img_metas'], inputs_hw)
        assign_results = self.assign_by_gt_and_feat(*assign_inputs)

        return assign_results
@MODELS.register_module()
class YOLOv7HeadAssigner(YOLOv7Head):
    """YOLOv7 head that reports which priors are assigned to which gt box.

    Used by ``assigner_visualization.py``; it reuses the head's own
    ``assigner`` and repackages the result per feature level and prior.
    """

    def assign_by_gt_and_feat(
        self,
        cls_scores: List[Tensor],
        bbox_preds: List[Tensor],
        objectnesses: List[Tensor],
        batch_gt_instances: InstanceList,
        batch_img_metas: List[dict],
        inputs_hw: Union[Tensor, tuple],
    ) -> List[List[dict]]:
        """Calculate the assigning results based on the gt and features
        extracted by the detection head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            objectnesses (Sequence[Tensor]): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W)
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            inputs_hw (Union[Tensor, tuple]): Height and width of inputs size.
                Not read here; the assigner receives
                ``batch_img_metas[0]['batch_input_shape']`` instead.
        Returns:
            list[list[dict]]: For every feature level, a list with one dict
            per base prior. Each dict holds ``stride``, ``grid_x_inds``,
            ``grid_y_inds``, ``img_inds``, ``class_inds``,
            ``retained_gt_inds`` and ``prior_ind``.
        """
        device = cls_scores[0][0].device

        head_preds = self._merge_predict_results(bbox_preds, objectnesses,
                                                 cls_scores)

        batch_targets_normed = self._convert_gt_to_norm_format(
            batch_gt_instances, batch_img_metas)

        # yolov5_assign and simota_assign
        assigner_results = self.assigner(
            head_preds,
            batch_targets_normed,
            batch_img_metas[0]['batch_input_shape'],
            self.priors_base_sizes,
            self.grid_offset,
            near_neighbor_thr=self.near_neighbor_thr)

        # multi-level positive sample position.
        mlvl_positive_infos = assigner_results['mlvl_positive_infos']
        # assigned results with label and bboxes information.
        mlvl_targets_normed = assigner_results['mlvl_targets_normed']

        assign_results = []
        for i in range(self.num_levels):
            assign_results_feat = []
            # no gt bbox matches anchor
            if mlvl_positive_infos[i].shape[0] == 0:
                for k in range(self.num_base_priors):
                    assign_results_feat.append({
                        'stride':
                        self.featmap_strides[i],
                        'grid_x_inds':
                        torch.zeros([0], dtype=torch.int64).to(device),
                        'grid_y_inds':
                        torch.zeros([0], dtype=torch.int64).to(device),
                        'img_inds':
                        torch.zeros([0], dtype=torch.int64).to(device),
                        'class_inds':
                        torch.zeros([0], dtype=torch.int64).to(device),
                        'retained_gt_inds':
                        torch.zeros([0], dtype=torch.int64).to(device),
                        'prior_ind':
                        k
                    })
                assign_results.append(assign_results_feat)
                continue

            # (batch_idx, prior_idx, x_scaled, y_scaled)
            positive_info = mlvl_positive_infos[i]
            targets_normed = mlvl_targets_normed[i]
            priors_inds = positive_info[:, 1]
            grid_x_inds = positive_info[:, 2]
            grid_y_inds = positive_info[:, 3]
            img_inds = targets_normed[:, 0]
            class_inds = targets_normed[:, 1].long()
            # Only the first prior's copy of the gt rows is used as the
            # reference when recovering gt indices.
            retained_gt_inds = self.get_gt_inds(
                targets_normed, batch_targets_normed[0]).long()
            for k in range(self.num_base_priors):
                retained_inds = priors_inds == k
                assign_results_prior = {
                    'stride': self.featmap_strides[i],
                    'grid_x_inds': grid_x_inds[retained_inds],
                    'grid_y_inds': grid_y_inds[retained_inds],
                    'img_inds': img_inds[retained_inds],
                    'class_inds': class_inds[retained_inds],
                    'retained_gt_inds': retained_gt_inds[retained_inds],
                    'prior_ind': k
                }
                assign_results_feat.append(assign_results_prior)
            assign_results.append(assign_results_feat)
        return assign_results

    def get_gt_inds(self, assigned_target, gt_instance):
        """Judging which one gt_ind is assigned by comparing assign_target and
        origin target.

        Args:
            assigned_target (Tensor(assign_nums,7)): YOLOv7 assigning results.
            gt_instance (Tensor(gt_nums,7)): Normalized gt_instance, It
                usually includes ``bboxes`` and ``labels`` attributes.
        Returns:
            gt_inds (Tensor): For every assigned target, the index of the
            first row of ``gt_instance`` that equals it in every column.
            Allocated on the device of ``assigned_target`` with the default
            floating dtype (callers cast with ``.long()``).

        Raises:
            IndexError: If an assigned target matches no gt row.
        """
        # Allocate on the input's device: the result is later masked with
        # tensors produced by the assigner, and mixing a CPU tensor with
        # CUDA masks fails.
        gt_inds = torch.zeros(
            assigned_target.shape[0], device=assigned_target.device)
        for i in range(assigned_target.shape[0]):
            matches = (assigned_target[i] == gt_instance).all(dim=1).nonzero()
            # Take the first match so duplicated gt rows do not break the
            # scalar assignment.
            gt_inds[i] = matches[0, 0]
        return gt_inds

    def assign(self, batch_data_samples: Union[list, dict],
               inputs_hw: Union[tuple, torch.Size]) -> List[List[dict]]:
        """Calculate assigning results.

        This function is provided to the
        `assigner_visualization.py` script.
        Args:
            batch_data_samples (List[:obj:`DetDataSample`], dict): The Data
                Samples. Only the dict form is supported; the keys ``feats``,
                ``bboxes_labels`` and ``img_metas`` are read from it.
            inputs_hw: Height and width of inputs size
        Returns:
            list[list[dict]]: Assigning results, see
            :meth:`assign_by_gt_and_feat`.

        Raises:
            NotImplementedError: If ``batch_data_samples`` is a list.
        """
        if isinstance(batch_data_samples, list):
            raise NotImplementedError(
                'assigning results_list is not implemented')
        else:
            # Fast version
            cls_scores, bbox_preds, objectnesses = self(
                batch_data_samples['feats'])
            assign_inputs = (cls_scores, bbox_preds, objectnesses,
                             batch_data_samples['bboxes_labels'],
                             batch_data_samples['img_metas'], inputs_hw)
        assign_results = self.assign_by_gt_and_feat(*assign_inputs)
        return assign_results
+from typing import List, Union + +import torch +from mmdet.utils import InstanceList +from torch import Tensor + +from mmyolo.models import YOLOv8Head +from mmyolo.models.utils import gt_instances_preprocess +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLOv8HeadAssigner(YOLOv8Head): + + def assign_by_gt_and_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + inputs_hw: Union[Tensor, tuple] = (640, 640) + ) -> dict: + """Calculate the assigning results based on the gt and features + extracted by the detection head. + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + inputs_hw (Union[Tensor, tuple]): Height and width of inputs size. + Returns: + dict[str, Tensor]: A dictionary of assigning results. 
+ """ + num_imgs = len(batch_img_metas) + device = cls_scores[0].device + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'].squeeze(0) + + pos_inds = fg_mask_pre_prior.nonzero().squeeze(1) + + targets = bbox_targets[pos_inds] + gt_bboxes = gt_bboxes.squeeze(0) + matched_gt_inds = torch.tensor( + [((t 
def assign(self, batch_data_samples: Union[list, dict],
           inputs_hw: Union[tuple, torch.Size]) -> dict:
    """Run the head on pre-extracted features and compute assignments.

    This function is provided to the `assigner_visualization.py` script.

    Args:
        batch_data_samples (List[:obj:`DetDataSample`], dict): The data
            samples. Only the dict form is handled; it is read through the
            keys ``'feats'``, ``'bboxes_labels'`` and ``'img_metas'``.
        inputs_hw: Height and width of inputs size.

    Returns:
        The value produced by ``assign_by_gt_and_feat`` for these inputs.

    Raises:
        NotImplementedError: If ``batch_data_samples`` is a list.
    """
    # Guard clause: the list form has no implementation.
    if isinstance(batch_data_samples, list):
        raise NotImplementedError(
            'assigning results_list is not implemented')

    # Fast version: the features are already extracted, so only the head
    # itself has to run before the assignment step.
    head_outputs = self(batch_data_samples['feats'])
    cls_scores, bbox_preds = head_outputs
    return self.assign_by_gt_and_feat(
        cls_scores,
        bbox_preds,
        batch_data_samples['bboxes_labels'],
        batch_data_samples['img_metas'],
        inputs_hw)
@MODELS.register_module()
class YOLODetectorAssigner(YOLODetector):
    """``YOLODetector`` variant that exposes the head's assignment results.

    This class is provided to the `assigner_visualization.py` script.
    """

    def assign(self, data: dict) -> Union[dict, list]:
        """Calculate assigning results from a batch of inputs and data
        samples.

        Args:
            data (dict): Data sampled from dataset. Must hold exactly one
                input (batch size 1).

        Returns:
            The assigning results returned by ``self.bbox_head.assign``.
        """
        assert isinstance(data, dict)
        assert len(data['inputs']) == 1, 'Only support batchsize == 1'
        data = self.data_preprocessor(data, True)

        batch_inputs = data['inputs']
        supported_heads = (YOLOv7HeadAssigner, YOLOv8HeadAssigner,
                           RTMHeadAssigner)
        # These heads read pre-computed features from the data samples, so
        # extract them here and pass them along under the 'feats' key.
        if isinstance(self.bbox_head, supported_heads):
            data['data_samples']['feats'] = self.extract_feat(batch_inputs)

        # The last two dimensions of the input batch are (height, width).
        return self.bbox_head.assign(data['data_samples'],
                                     batch_inputs.shape[-2:])
@VISUALIZERS.register_module()
class YOLOAssignerVisualizer(DetLocalVisualizer):
    """MMYOLO Detection Assigner Visualizer.

    Draws, per feature level and per prior, the grid, the matched ground
    truth boxes and the positive sample locations on top of an image.

    This class is provided to the `assigner_visualization.py` script.

    Args:
        name (str): Name of the instance. Defaults to 'visualizer'.
    """

    def __init__(self, name: str = 'visualizer', *args, **kwargs):
        super().__init__(name=name, *args, **kwargs)
        # need priors_size from config; while it is None the drawing
        # methods fall back to a stride-sized square prior.
        self.priors_size = None

    def draw_grid(self,
                  stride: int = 8,
                  line_styles: Union[str, List[str]] = ':',
                  colors: Union[str, tuple, List[str],
                                List[tuple]] = (180, 180, 180),
                  line_widths: Union[Union[int, float],
                                     List[Union[int, float]]] = 1):
        """Draw grids on image.

        Args:
            stride (int): Downsample factor of feature map.
            line_styles (Union[str, List[str]]): The linestyle
                of lines. ``line_styles`` can have the same length with
                texts or just single value. If ``line_styles`` is single
                value, all the lines will have the same linestyle.
                Reference to
                https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle
                for more details. Defaults to ':'.
            colors (Union[str, tuple, List[str], List[tuple]]): The colors of
                lines. ``colors`` can have the same length with lines or just
                single value. If ``colors`` is single value, all the lines
                will have the same colors. Reference to
                https://matplotlib.org/stable/gallery/color/named_colors.html
                for more details. Defaults to (180, 180, 180).
            line_widths (Union[Union[int, float], List[Union[int, float]]]):
                The linewidth of lines. ``line_widths`` can have
                the same length with lines or just single value.
                If ``line_widths`` is single value, all the lines will
                have the same linewidth. Defaults to 1.
        """
        assert self._image is not None, 'Please set image using `set_image`'

        # Only interior grid lines are drawn. Clamp at zero: when the image
        # is smaller than one stride the count would be negative, which
        # ``ndarray.repeat`` rejects.
        num_vertical = max(self.width // stride - 1, 0)
        num_horizontal = max(self.height // stride - 1, 0)

        # draw vertical lines
        if num_vertical > 0:
            x_datas_vertical = ((np.arange(num_vertical) + 1) *
                                stride).reshape((-1, 1)).repeat(
                                    2, axis=1)
            y_datas_vertical = np.array([[0, self.height - 1]]).repeat(
                num_vertical, axis=0)
            self.draw_lines(
                x_datas_vertical,
                y_datas_vertical,
                colors=colors,
                line_styles=line_styles,
                line_widths=line_widths)

        # draw horizontal lines
        if num_horizontal > 0:
            x_datas_horizontal = np.array([[0, self.width - 1]]).repeat(
                num_horizontal, axis=0)
            y_datas_horizontal = ((np.arange(num_horizontal) + 1) *
                                  stride).reshape((-1, 1)).repeat(
                                      2, axis=1)
            self.draw_lines(
                x_datas_horizontal,
                y_datas_horizontal,
                colors=colors,
                line_styles=line_styles,
                line_widths=line_widths)

    def draw_instances_assign(self,
                              instances: InstanceData,
                              retained_gt_inds: Tensor,
                              not_show_label: bool = False):
        """Draw instances of GT.

        Args:
            instances (:obj:`InstanceData`): gt_instance. It usually
                includes ``bboxes`` and ``labels`` attributes.
            retained_gt_inds (Tensor): The gt indexes assigned as the
                positive sample in the current prior.
            not_show_label (bool): If True, the gt labels are NOT drawn;
                only the boxes are. Defaults to False.

        Returns:
            The current image when ``retained_gt_inds`` is empty, otherwise
            ``None`` (the drawing happens on the visualizer's canvas).
        """
        assert self.dataset_meta is not None
        classes = self.dataset_meta['classes']
        palette = self.dataset_meta['palette']
        if len(retained_gt_inds) == 0:
            return self.get_image()

        # Several positives may share one gt box; draw each box only once.
        draw_gt_inds = torch.unique(retained_gt_inds.cpu()).long()
        bboxes = instances.bboxes[draw_gt_inds]
        labels = instances.labels[draw_gt_inds]

        if not isinstance(bboxes, Tensor):
            bboxes = bboxes.tensor

        edge_colors = [palette[i] for i in labels]

        max_label = int(max(labels) if len(labels) > 0 else 0)
        text_palette = get_palette(self.text_color, max_label + 1)
        text_colors = [text_palette[label] for label in labels]

        self.draw_bboxes(
            bboxes,
            edge_colors=edge_colors,
            alpha=self.alpha,
            line_widths=self.line_width)

        if not not_show_label:
            # Place each label just inside the top-left corner of its box.
            positions = bboxes[:, :2] + self.line_width
            areas = (bboxes[:, 3] - bboxes[:, 1]) * (
                bboxes[:, 2] - bboxes[:, 0])
            scales = _get_adaptive_scales(areas)
            for i, (pos, label) in enumerate(zip(positions, labels)):
                label_text = classes[
                    label] if classes is not None else f'class {label}'

                self.draw_texts(
                    label_text,
                    pos,
                    colors=text_colors[i],
                    font_sizes=int(13 * scales[i]),
                    bboxes=[{
                        'facecolor': 'black',
                        'alpha': 0.8,
                        'pad': 0.7,
                        'edgecolor': 'none'
                    }])

    def draw_positive_assign(self,
                             grid_x_inds: Tensor,
                             grid_y_inds: Tensor,
                             class_inds: Tensor,
                             stride: int,
                             bboxes: Union[Tensor, HorizontalBoxes],
                             retained_gt_inds: Tensor,
                             offset: float = 0.5):
        """Draw a filled circle at every positive sample location.

        Args:
            grid_x_inds (Tensor): The X-axis indexes of the positive sample
                in current prior.
            grid_y_inds (Tensor): The Y-axis indexes of the positive sample
                in current prior.
            class_inds (Tensor): The classes indexes of the positive sample
                in current prior.
            stride (int): Downsample factor of feature map.
            bboxes (Union[Tensor, HorizontalBoxes]): Bounding boxes of GT.
            retained_gt_inds (Tensor): The gt indexes assigned as the
                positive sample in the current prior.
            offset (float): The offset of points, the value is normalized
                with corresponding stride. Defaults to 0.5.
        """
        if not isinstance(bboxes, Tensor):
            # Convert HorizontalBoxes to Tensor
            bboxes = bboxes.tensor

        # The PALETTE in the dataset_meta is required
        assert self.dataset_meta is not None
        palette = self.dataset_meta['palette']

        # Grid cell index -> pixel coordinate of the sample point.
        x = ((grid_x_inds + offset) * stride).long()
        y = ((grid_y_inds + offset) * stride).long()
        center = torch.stack((x, y), dim=-1)

        # The circle radius follows the area of the matched gt box.
        retained_bboxes = bboxes[retained_gt_inds]
        bbox_wh = retained_bboxes[:, 2:] - retained_bboxes[:, :2]
        bbox_area = bbox_wh[:, 0] * bbox_wh[:, 1]
        radius = _get_adaptive_scales(bbox_area) * 4
        colors = [palette[i] for i in class_inds]

        self.draw_circles(
            center,
            radius,
            colors,
            line_widths=0,
            face_colors=colors,
            alpha=1.0)

    def draw_prior(self,
                   grid_x_inds: Tensor,
                   grid_y_inds: Tensor,
                   class_inds: Tensor,
                   stride: int,
                   feat_ind: int,
                   prior_ind: int,
                   offset: float = 0.5):
        """Draw priors on image.

        Args:
            grid_x_inds (Tensor): The X-axis indexes of the positive sample
                in current prior.
            grid_y_inds (Tensor): The Y-axis indexes of the positive sample
                in current prior.
            class_inds (Tensor): The classes indexes of the positive sample
                in current prior.
            stride (int): Downsample factor of feature map.
            feat_ind (int): Index of featmap.
            prior_ind (int): Index of prior in current featmap.
            offset (float): The offset of points, the value is normalized
                with corresponding stride. Defaults to 0.5.
        """
        # The PALETTE in the dataset_meta is required
        assert self.dataset_meta is not None
        palette = self.dataset_meta['palette']

        center_x = ((grid_x_inds + offset) * stride)
        center_y = ((grid_y_inds + offset) * stride)
        xyxy = torch.stack((center_x, center_y, center_x, center_y), dim=1)
        device = xyxy.device
        if self.priors_size is not None:
            xyxy += self.priors_size[feat_ind][prior_ind].to(device)
        else:
            # No configured prior: use a stride-sized square around the
            # sample point.
            xyxy += torch.tensor(
                [[-stride / 2, -stride / 2, stride / 2, stride / 2]],
                device=device)

        colors = [palette[i] for i in class_inds]
        self.draw_bboxes(
            xyxy,
            edge_colors=colors,
            alpha=self.alpha,
            line_styles='--',
            line_widths=math.ceil(self.line_width * 0.3))

    def draw_assign(self,
                    image: np.ndarray,
                    assign_results: List[List[dict]],
                    gt_instances: InstanceData,
                    show_prior: bool = False,
                    not_show_label: bool = False) -> np.ndarray:
        """Draw assigning results.

        Args:
            image (np.ndarray): The image to draw.
            assign_results (list): The assigning results, indexed first by
                feature level and then by prior.
            gt_instances (:obj:`InstanceData`): Data structure for
                instance-level annotations or predictions.
            show_prior (bool): Whether to show prior on image.
            not_show_label (bool): If True, the gt labels are NOT drawn on
                the images. Defaults to False.

        Returns:
            np.ndarray: the drawn image which channel is RGB.
        """
        h, w = image.shape[:2]
        img_show_list = []
        for feat_ind, assign_results_feat in enumerate(assign_results):
            img_show_list_feat = []
            for prior_ind, assign_results_prior in enumerate(
                    assign_results_feat):
                # Every (level, prior) panel starts from a clean image.
                self.set_image(image)

                # draw grid
                stride = assign_results_prior['stride']
                self.draw_grid(stride)

                # draw prior on matched gt
                grid_x_inds = assign_results_prior['grid_x_inds']
                grid_y_inds = assign_results_prior['grid_y_inds']
                class_inds = assign_results_prior['class_inds']
                # The head's own prior index takes precedence over the
                # position in the list.
                prior_ind = assign_results_prior['prior_ind']
                offset = assign_results_prior.get('offset', 0.5)

                if show_prior:
                    self.draw_prior(grid_x_inds, grid_y_inds, class_inds,
                                    stride, feat_ind, prior_ind, offset)

                # draw matched gt
                retained_gt_inds = assign_results_prior['retained_gt_inds']
                self.draw_instances_assign(gt_instances, retained_gt_inds,
                                           not_show_label)

                # draw positive
                self.draw_positive_assign(grid_x_inds, grid_y_inds, class_inds,
                                          stride, gt_instances.bboxes,
                                          retained_gt_inds, offset)

                # draw title
                if self.priors_size is not None:
                    base_prior = self.priors_size[feat_ind][prior_ind]
                else:
                    base_prior = [stride, stride, stride * 2, stride * 2]
                prior_size = (base_prior[2] - base_prior[0],
                              base_prior[3] - base_prior[1])
                pos = np.array((20, 20))
                text = f'feat_ind: {feat_ind}  ' \
                       f'prior_ind: {prior_ind} ' \
                       f'prior_size: ({prior_size[0]}, {prior_size[1]})'
                scales = _get_adaptive_scales(np.array([h * w / 16]))
                # ``scales`` holds one value per input area; take the single
                # element explicitly instead of converting the array itself
                # to ``int``.
                font_sizes = int(13 * scales[0])
                self.draw_texts(
                    text,
                    pos,
                    colors=self.text_color,
                    font_sizes=font_sizes,
                    bboxes=[{
                        'facecolor': 'black',
                        'alpha': 0.8,
                        'pad': 0.7,
                        'edgecolor': 'none'
                    }])

                img_show = self.get_image()
                img_show = mmcv.impad(img_show, padding=(5, 5, 5, 5))
                img_show_list_feat.append(img_show)
            img_show_list.append(np.concatenate(img_show_list_feat, axis=1))

        # Merge all images into one image
        # setting axis is to beautify the merged image
        axis = 0 if len(assign_results[0]) > 1 else 1
        return np.concatenate(img_show_list, axis=axis)
class DeployC2f(nn.Module):
    """Forward pass of a C2f-style layer written for deployment.

    ``__init__`` creates no sub-modules. ``forward`` reads ``main_conv``,
    ``mid_channels``, ``blocks`` and ``final_conv`` from the instance, so
    these have to be provided on the object by whoever uses this class.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        """Concatenate the stem output with every chained block output.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: ``final_conv`` applied to the channel-wise concatenation
            of the stem output and the output of each block.
        """
        stem = self.main_conv(x)
        # The first block is fed the stem channels from ``mid_channels``
        # onwards; every later block is fed the previous block's output.
        latest = stem[:, self.mid_channels:, ...]
        branches = [stem]
        for block in self.blocks:
            latest = block(latest)
            branches.append(latest)
        return self.final_conv(torch.cat(branches, 1))
class DeployFocus(nn.Module):
    """Focus (space-to-depth) layer expressed with reshape/permute only.

    Args:
        orin_Focus (nn.Module): The original focus module. Its instance
            state is copied onto this object, which is where the ``conv``
            used by ``forward`` comes from.
    """

    def __init__(self, orin_Focus: nn.Module):
        super().__init__()
        # Adopt the original module's state (sub-modules, parameters, ...).
        self.__dict__.update(orin_Focus.__dict__)

    def forward(self, x: Tensor) -> Tensor:
        """Fold every 2x2 spatial patch into channels, then apply ``conv``.

        Args:
            x (Tensor): Input of shape (batch, channel, height, width).

        Returns:
            Tensor: ``conv`` applied to a tensor of shape
            (batch, channel * 4, height / 2, width / 2).
        """
        batch_size, channel, height, width = x.shape
        # Split height, then width, into (half, parity) pairs.
        x = x.reshape(batch_size, channel, -1, 2, width)
        x = x.reshape(batch_size, channel, x.shape[2], 2, -1, 2)
        half_h = x.shape[2]
        half_w = x.shape[4]
        # Move the two parity axes in front of the channel axis
        # (width parity outermost, then height parity).
        x = x.permute(0, 5, 3, 1, 2, 4)
        x = x.reshape(batch_size, channel * 4, half_h, half_w)

        return self.conv(x)


class NcnnFocus(nn.Module):
    """Focus layer expressed as two channel shuffles.

    Args:
        orin_Focus (nn.Module): The original focus module. Its instance
            state is copied onto this object, which is where the ``conv``
            used by ``forward`` comes from.
    """

    def __init__(self, orin_Focus: nn.Module):
        super().__init__()
        # Adopt the original module's state (sub-modules, parameters, ...).
        self.__dict__.update(orin_Focus.__dict__)

    def forward(self, x: Tensor) -> Tensor:
        """Fold every 2x2 spatial patch into channels, then apply ``conv``.

        Args:
            x (Tensor): Input of shape (batch, c, h, w) with even ``h`` and
                ``w``.

        Returns:
            Tensor: ``conv`` applied to a tensor of shape
            (batch, c * 4, h / 2, w / 2).

        Raises:
            AssertionError: If ``h`` or ``w`` is odd.
        """
        batch_size, c, h, w = x.shape
        # Adjacent string literals keep the message on one clean line; a
        # backslash continuation inside the literal would embed the source
        # indentation in the message.
        assert h % 2 == 0 and w % 2 == 0, (
            'focus for yolox needs even feature height and width, '
            f'got {(h, w)}.')

        # First shuffle: separate even and odd rows.
        x = x.reshape(batch_size, c * h, 1, w)
        _b, _c, _h, _w = x.shape
        g = _c // 2
        # fuse to ncnn's shufflechannel
        x = x.view(_b, g, 2, _h, _w)
        x = torch.transpose(x, 1, 2).contiguous()
        x = x.view(_b, -1, _h, _w)

        # Second shuffle: separate even and odd columns.
        x = x.reshape(_b, c * h * w, 1, 1)

        _b, _c, _h, _w = x.shape
        g = _c // 2
        # fuse to ncnn's shufflechannel
        x = x.view(_b, g, 2, _h, _w)
        x = torch.transpose(x, 1, 2).contiguous()
        x = x.view(_b, -1, _h, _w)

        x = x.reshape(_b, c * 4, h // 2, w // 2)

        return self.conv(x)


class GConvFocus(nn.Module):
    """Focus layer expressed as four stride-2 grouped convolutions.

    Each fixed 2x2 kernel has a single 1 and therefore picks one pixel of
    every 2x2 patch. The kernels are built with 3 groups, i.e. for a
    3-channel input.

    Args:
        orin_Focus (nn.Module): The original focus module. It must own at
            least one parameter (its device is used for the kernels). Its
            instance state is copied onto this object, which is where the
            ``conv`` used by ``forward`` comes from.
    """

    def __init__(self, orin_Focus: nn.Module):
        super().__init__()
        device = next(orin_Focus.parameters()).device
        # Kernels selecting, in order: top-left, bottom-left, top-right and
        # bottom-right pixel of each 2x2 patch.
        self.weight1 = torch.tensor([[1., 0], [0, 0]]).expand(3, 1, 2,
                                                              2).to(device)
        self.weight2 = torch.tensor([[0, 0], [1., 0]]).expand(3, 1, 2,
                                                              2).to(device)
        self.weight3 = torch.tensor([[0, 1.], [0, 0]]).expand(3, 1, 2,
                                                              2).to(device)
        self.weight4 = torch.tensor([[0, 0], [0, 1.]]).expand(3, 1, 2,
                                                              2).to(device)
        # Adopt the original module's state (sub-modules, parameters, ...).
        self.__dict__.update(orin_Focus.__dict__)

    def forward(self, x: Tensor) -> Tensor:
        """Fold every 2x2 spatial patch into channels, then apply ``conv``.

        Args:
            x (Tensor): Input of shape (batch, 3, height, width).

        Returns:
            Tensor: ``conv`` applied to the four sub-sampled images
            concatenated along the channel axis.
        """
        conv1 = F.conv2d(x, self.weight1, stride=2, groups=3)
        conv2 = F.conv2d(x, self.weight2, stride=2, groups=3)
        conv3 = F.conv2d(x, self.weight3, stride=2, groups=3)
        conv4 = F.conv2d(x, self.weight4, stride=2, groups=3)
        return self.conv(torch.cat([conv1, conv2, conv3, conv4], dim=1))
def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
                        stride: Tensor) -> Tensor:
    """Decode YOLOv5 box regressions into centre/size boxes.

    Args:
        priors (Tensor): Prior boxes; the last dimension is
            (x1, y1, x2, y2).
        bbox_preds (Tensor): Raw regression outputs; the last dimension has
            4 entries. A sigmoid is applied here.
        stride (Tensor): Stride used to scale the centre offsets. It must
            broadcast against ``bbox_preds[..., 0]``.

    Returns:
        Tensor: Decoded boxes with last dimension
        (x_center, y_center, w, h).
    """
    bbox_preds = bbox_preds.sigmoid()

    x_center = (priors[..., 0] + priors[..., 2]) * 0.5
    y_center = (priors[..., 1] + priors[..., 3]) * 0.5
    w = priors[..., 2] - priors[..., 0]
    h = priors[..., 3] - priors[..., 1]

    # Centre offsets lie in (-stride, stride); sizes scale the prior by a
    # factor in (0, 4).
    x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center
    y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center
    w_pred = (bbox_preds[..., 2] * 2)**2 * w
    h_pred = (bbox_preds[..., 3] * 2)**2 * h

    decoded_bboxes = torch.stack(
        [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1)

    return decoded_bboxes


def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
                        stride: Optional[Tensor]) -> Tensor:
    """Decode RTMDet distance regressions into corner boxes.

    Args:
        priors (Tensor): Prior points; entries 0 and 1 of the last dimension
            are the x and y coordinates.
        bbox_preds (Tensor): Distances (left, top, right, bottom) in units
            of ``stride``. The tensor is left unmodified.
        stride (Tensor, optional): 1-D per-prior strides. Although
            annotated as optional, it is indexed unconditionally, so
            ``None`` is not accepted.

    Returns:
        Tensor: Decoded boxes with last dimension (x1, y1, x2, y2).
    """
    stride = stride[None, :, None]
    # Out-of-place multiply: an in-place ``*=`` would rescale the caller's
    # tensor, so a second use of the same predictions would be scaled twice.
    bbox_preds = bbox_preds * stride
    tl_x = (priors[..., 0] - bbox_preds[..., 0])
    tl_y = (priors[..., 1] - bbox_preds[..., 1])
    br_x = (priors[..., 0] + bbox_preds[..., 2])
    br_y = (priors[..., 1] + bbox_preds[..., 3])
    decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1)
    return decoded_bboxes


def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
                       stride: Optional[Tensor]) -> Tensor:
    """Decode YOLOX regressions into centre/size boxes.

    Args:
        priors (Tensor): Prior points; must broadcast against
            ``bbox_preds[..., :2]``.
        bbox_preds (Tensor): Raw regression outputs; the last dimension is
            (dx, dy, log_w, log_h) in units of ``stride``.
        stride (Tensor, optional): 1-D per-prior strides. Although
            annotated as optional, it is indexed unconditionally, so
            ``None`` is not accepted.

    Returns:
        Tensor: Decoded boxes with last dimension
        (x_center, y_center, w, h).
    """
    stride = stride[None, :, None]
    xys = (bbox_preds[..., :2] * stride) + priors
    whs = bbox_preds[..., 2:].exp() * stride
    decoded_bboxes = torch.cat([xys, whs], -1)
    return decoded_bboxes
/usr/local/cuda/bin/nvcc) + +project(nvdsparsebbox_mmyolo LANGUAGES CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_BUILD_TYPE Release) +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +# CUDA +find_package(CUDA REQUIRED) + +# TensorRT +set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") +set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") + +# DeepStream +set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") +set(DS_LIBRARIES ${DEEPSTREAM}/lib) +set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) + +include_directories( + ${CUDA_INCLUDE_DIRS} + ${TensorRT_INCLUDE_DIRS} + ${DS_INCLUDE_DIRS}) + +add_library( + ${PROJECT_NAME} + SHARED + custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) + +target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README.md b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README.md new file mode 100644 index 0000000000000000000000000000000000000000..111f3765e41d558b64097d8a25585bd9c14acf4f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README.md @@ -0,0 +1,48 @@ +# Inference MMYOLO Models with DeepStream + +This project demonstrates how to inference MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). + +## Pre-requisites + +### 1. Install Nvidia Driver and CUDA + +First, please follow the official documents and instructions to install dedicated Nvidia graphic driver and CUDA matched to your gpu and target Nvidia AIoT devices. + +### 2. Install DeepStream SDK + +Second, please follow the official instruction to download and install DeepStream SDK. Currently stable version of DeepStream is v6.2. + +### 3. 
Generate TensorRT Engine + +As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/). + +## Build and Run + +Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to point to the config of the model you want to run. + +```bash +mkdir build && cd build +cmake .. +make -j$(nproc) && make install +``` + +Then you can run the inference with this command. + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## Code Structure + +```bash +├── deepstream +│ ├── configs # config file for MMYOLO models +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats +│ │ └── nvdsparsebbox_mmyolo.cpp +│ ├── CMakeLists.txt +│ ├── coco_labels.txt # labels for coco detection +│ ├── deepstream_app_config.txt # DeepStream reference app configs for MMYOLO models +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..13a85d5bc90159c3ff9f1a32e93d01e82ed2faa4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md @@ -0,0 +1,48 @@ +# 使用 DeepStream SDK 推理 MMYOLO 模型 + +本项目演示了如何使用 [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) 配合改写的 parser 来推理 MMYOLO 的模型。 + +## 预先准备 + +### 1. 安装 NVIDIA 驱动和 CUDA + +首先请根据当前的显卡驱动和目标使用设备的驱动完成显卡驱动和 CUDA 的安装。 + +### 2. 安装 DeepStream SDK + +目前 DeepStream SDK 稳定版本已经更新到 v6.2,官方推荐使用这个版本。 + +### 3. 
将 MMYOLO 模型转换为 TensorRT Engine + +推荐使用 EasyDeploy 中的 TensorRT 方案完成目标模型的转换部署,具体可参考 [此文档](../../easydeploy/docs/model_convert.md) 。 + +## 编译使用 + +当前项目使用的是 MMYOLO 的 rtmdet 模型,若想使用其他的模型,请参照目录下的配置文件进行改写。然后将转换完的 TensorRT engine 放在当前目录下并执行如下命令: + +```bash +mkdir build && cd build +cmake .. +make -j$(nproc) && make install +``` + +完成编译后可使用如下命令进行推理: + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## 项目代码结构 + +```bash +├── deepstream +│ ├── configs # MMYOLO 模型对应的 DeepStream 配置 +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # 适配 DeepStream formats 的 parser +│ │ └── nvdsparsebbox_mmyolo.cpp +| ├── CMakeLists.txt +│ ├── coco_labels.txt # coco labels +│ ├── deepstream_app_config.txt # DeepStream app 配置 +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git 
a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1e5efd2a3810730144e037ee96dfbd36124b0e6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt @@ -0,0 +1,22 @@ +[property] +gpu-id=0 +net-scale-factor=0.01735207357279195 +offsets=57.375;57.12;58.395 +model-color-format=1 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git 
a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb780856cbd2b289cdf9dc8518438f946a2ab548 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp @@ -0,0 +1,118 @@ +#include "nvdsinfer_custom_impl.h" +#include +#include + +/** + * Function expected by DeepStream for decoding the MMYOLO output. + * + * C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after + * adding all bounding boxes to the objectList vector. + * + * @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer. + * @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network. 
+ * @param [detectionParams] NvDsInferParseDetectionParams with information about some config params. + * @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must + * be stored. + * + * @return true + */ + +// This is just the function prototype. The definition is written at the end of the file. +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList); + +static __inline__ float clamp(float& val, float min, float max) +{ + return val > min ? (val < max ? val : max) : min; +} + +static std::vector decodeMMYoloTensor( + const int* num_dets, + const float* bboxes, + const float* scores, + const int* labels, + const float& conf_thres, + const unsigned int& img_w, + const unsigned int& img_h +) +{ + std::vector bboxInfo; + size_t nums = num_dets[0]; + for (size_t i = 0; i < nums; i++) + { + float score = scores[i]; + if (score < conf_thres)continue; + float x0 = (bboxes[i * 4]); + float y0 = (bboxes[i * 4 + 1]); + float x1 = (bboxes[i * 4 + 2]); + float y1 = (bboxes[i * 4 + 3]); + x0 = clamp(x0, 0.f, img_w); + y0 = clamp(y0, 0.f, img_h); + x1 = clamp(x1, 0.f, img_w); + y1 = clamp(y1, 0.f, img_h); + NvDsInferParseObjectInfo obj; + obj.left = x0; + obj.top = y0; + obj.width = x1 - x0; + obj.height = y1 - y0; + obj.detectionConfidence = score; + obj.classId = labels[i]; + bboxInfo.push_back(obj); + } + + return bboxInfo; +} + +/* C-linkage to prevent name-mangling */ +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList) +{ + +// Some assertions and error checking. 
+ if (outputLayersInfo.empty() || outputLayersInfo.size() != 4) + { + std::cerr << "Could not find output layer in bbox parsing" << std::endl; + return false; + } + +// Score threshold of bboxes. + const float conf_thres = detectionParams.perClassThreshold[0]; + +// Obtaining the output layer. + const NvDsInferLayerInfo& num_dets = outputLayersInfo[0]; + const NvDsInferLayerInfo& bboxes = outputLayersInfo[1]; + const NvDsInferLayerInfo& scores = outputLayersInfo[2]; + const NvDsInferLayerInfo& labels = outputLayersInfo[3]; + +// num_dets(int) bboxes(float) scores(float) labels(int) + assert (num_dets.dims.numDims == 2); + assert (bboxes.dims.numDims == 3); + assert (scores.dims.numDims == 2); + assert (labels.dims.numDims == 2); + + +// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format. + std::vector objects = + decodeMMYoloTensor( + (const int*)(num_dets.buffer), + (const float*)(bboxes.buffer), + (const float*)(scores.buffer), + (const int*)(labels.buffer), + conf_thres, + networkInfo.width, + networkInfo.height + ); + + objectList.clear(); + objectList = objects; + return true; +} + +/* Check that the custom function has been defined correctly */ +CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO); diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..331776897a5e9109b9007ed1b7974f128287c4fc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt @@ -0,0 +1,62 @@ +[application] +enable-perf-measurement=1 +perf-measurement-interval-sec=5 + +[tiled-display] +enable=1 +rows=1 +columns=1 +width=1280 +height=720 +gpu-id=0 +nvbuf-memory-type=0 + +[source0] +enable=1 +type=3 +uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 
+num-sources=1 +gpu-id=0 +cudadec-memtype=0 + +[sink0] +enable=1 +type=2 +sync=0 +gpu-id=0 +nvbuf-memory-type=0 + +[osd] +enable=1 +gpu-id=0 +border-width=5 +text-size=15 +text-color=1;1;1;1; +text-bg-color=0.3;0.3;0.3;1 +font=Serif +show-clock=0 +clock-x-offset=800 +clock-y-offset=820 +clock-text-size=12 +clock-color=1;0;0;0 +nvbuf-memory-type=0 + +[streammux] +gpu-id=0 +live-source=0 +batch-size=1 +batched-push-timeout=40000 +width=1920 +height=1080 +enable-padding=0 +nvbuf-memory-type=0 + +[primary-gie] +enable=1 +gpu-id=0 +gie-unique-id=1 +nvbuf-memory-type=0 +config-file=configs/config_infer_rtmdet.txt + +[tests] +file-loop=0 diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/docs/model_convert.md b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/docs/model_convert.md new file mode 100644 index 0000000000000000000000000000000000000000..9af62599dd1b56648680fc315ca88c35c7b31cb9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/docs/model_convert.md @@ -0,0 +1,156 @@ +# MMYOLO 模型 ONNX 转换 + +## 1. 
导出后端支持的 ONNX + +## 环境依赖 + +- [onnx](https://github.com/onnx/onnx) + + ```shell + pip install onnx + ``` + + [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (可选,用于简化模型) + + ```shell + pip install onnx-simplifier + ``` + +\*\*\* 请确保您在 `MMYOLO` 根目录下运行相关脚本,避免无法找到相关依赖包。\*\*\* + +## 使用方法 + +[模型导出脚本](./projects/easydeploy/tools/export_onnx.py)用于将 `MMYOLO` 模型转换为 `onnx` 。 + +### 参数介绍: + +- `config` : 构建模型使用的配置文件,如 [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) 。 +- `checkpoint` : 训练得到的权重文件,如 `yolov5s.pth` 。 +- `--work-dir` : 转换后的模型保存路径。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--batch-size`: 转换后的模型输入 `batch size` 。 +- `--device`: 转换模型使用的设备,默认为 `cuda:0`。 +- `--simplify`: 是否简化导出的 `onnx` 模型,需要安装 [onnx-simplifier](https://github.com/daquexian/onnx-simplifier),默认关闭。 +- `--opset`: 指定导出 `onnx` 的 `opset`,默认为 `11` 。 +- `--backend`: 指定导出 `onnx` 用于的后端名称,`ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`,默认为`onnxruntime`即 `ONNXRuntime`。 +- `--pre-topk`: 指定导出 `onnx` 的后处理筛选候选框个数阈值,默认为 `1000`。 +- `--keep-topk`: 指定导出 `onnx` 的非极大值抑制输出的候选框个数阈值,默认为 `100`。 +- `--iou-threshold`: 非极大值抑制中过滤重复候选框的 `iou` 阈值,默认为 `0.65`。 +- `--score-threshold`: 非极大值抑制中过滤候选框得分的阈值,默认为 `0.25`。 +- `--model-only`: 指定仅导出模型 backbone + neck, 不包含后处理,默认关闭。 + +例子: + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend 1 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +然后利用后端支持的工具如 `TensorRT` 读取 `onnx` 再次转换为后端支持的模型格式如 `.engine/.plan` 等。 + +`MMYOLO` 目前支持 `TensorRT8`, `TensorRT7`, `ONNXRuntime` 后端的端到端模型转换,目前仅支持静态 shape 模型的导出和转换,动态 batch 或动态长宽的模型端到端转换会在未来继续支持。 + +端到端转换得到的 `onnx` 模型输入输出如图: + +
+ +
+ +输入名: `images`, 尺寸 640x640 + +输出名: `num_dets`, 尺寸 1x1,表示检测目标数量。 + +输出名: `boxes`, 尺寸 1x100x4,表示检测框的坐标,格式为 `x1y1x2y2`。 + +输出名: `scores`, 尺寸 1x100,表示检测框的分数。 + +输出名: `labels`, 尺寸 1x100,表示检测框的类别 id。 + +可以利用 `num_dets` 中的个数对 `boxes`, `scores`, `labels` 进行截断,从 100 个检测结果中抽取前 `num_dets` 个目标作为最终检测结果。 + +## 2. 仅导出模型 Backbone + Neck + +当您需要部署在非 `TensorRT`, `ONNXRuntime` 等支持端到端部署的平台时,您可以考虑使用`--model-only` 参数并且不要传递 `--backend` 参数,您将会导出仅包含 `Backbone` + `neck` 的模型,模型的部分输出如图: + +
+ +
+ +这种导出方式获取的 `ONNX` 模型具有如下优点: + +- 算子简单,一般而言只包含 `Conv`,激活函数等简单算子,几乎不存在无法正确导出的情况,对于嵌入式部署更加友好。 +- 方便不同算法之间对比速度性能,由于不同的算法后处理不同,仅对比 `backbone` + `Neck` 的速度更加公平。 + +也有如下缺点: + +- 后处理逻辑需要单独完成,会有额外的 `decode` + `nms` 的操作需要实现。 +- 与 `TensorRT` 相比,由于 `TensorRT` 可以利用多核优势并行进行后处理,使用 `--model-only` 方式导出的模型性能会差很多。 + +### 使用方法 + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --model-only +``` + +## 使用 `model-only` 导出的 ONNX 进行推理 + +[模型推理脚本](./projects/easydeploy/examples/main_onnxruntime.py)用于推理导出的 `ONNX` 模型,需要安装基础依赖环境: + +[`onnxruntime`](https://github.com/microsoft/onnxruntime) 和 [`opencv-python`](https://github.com/opencv/opencv-python) + +```shell +pip install onnxruntime +pip install opencv-python==4.7.0.72 # 建议使用最新的 opencv +``` + +### 参数介绍: + +- `img` : 待检测的图片路径或图片文件夹路径。 +- `onnx` : 导出的 `model-only` ONNX 模型。 +- `--type` : 模型名称,目前支持 `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--out-dir`: 保存检测结果的路径 。 +- `--show`: 是否可视化检测结果。 +- `--score-thr`: 模型检测后处理的置信度分数 。 +- `--iou-thr`: 模型检测后处理的 IOU 分数 。 + +## 使用方法 + +```shell +cd ./projects/easydeploy/examples +python main_onnxruntime.py \ + "image_path_to_detect" \ + yolov5_s_model-only.onnx \ + --out-dir work_dir \ + --img-size 640 640 \ + --show \ + --score-thr 0.3 \ + --iou-thr 0.7 +``` + +*注意!!!* + +当您使用自定义数据集训练得到的模型时,请修改 [`config.py`](./projects/easydeploy/examples/config.py) 中 `CLASS_NAMES` 和 `CLASS_COLORS`,如果是 `yolov5` 或者 `yolov7` 基于 `anchor` 的模型请同时修改 `YOLOv5_ANCHORS` 和 `YOLOv7_ANCHORS`。 + +[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) 是目前所有算法仅使用 `numpy` 实现的 `decoder`,如果您对性能有较高的要求,可以参照相关代码改写为 `c/c++`。 diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/examples/config.py 
b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/examples/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a85ff34273c22a356c9d6a3eaeb048b637b5f40 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/examples/config.py @@ -0,0 +1,64 @@ +from enum import Enum + + +class TASK_TYPE(Enum): + DET = 'det' + SEG = 'seg' + POSE = 'pose' + + +class ModelType(Enum): + YOLOV5 = 'yolov5' + YOLOX = 'yolox' + PPYOLOE = 'ppyoloe' + PPYOLOEP = 'ppyoloep' + YOLOV6 = 'yolov6' + YOLOV7 = 'yolov7' + RTMDET = 'rtmdet' + YOLOV8 = 'yolov8' + + +CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + +CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), + (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70), + (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0), + (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255), + (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 
def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]],
                        scores: Union[List[float], Tuple[float]],
                        labels: Union[List[int], Tuple[int]],
                        conf_thres: float = 0.25,
                        iou_thres: float = 0.65) -> Tuple[List, List, List]:
    """Filter decoded proposals with OpenCV NMS.

    On OpenCV >= 4.7 the suppression is done per class via
    ``cv2.dnn.NMSBoxesBatched``; older 4.x releases fall back to
    ``cv2.dnn.NMSBoxes``, which does not take ``labels`` into account.

    Args:
        boxes: Proposals in ``x0, y0, w, h`` layout.
        scores: One confidence per proposal.
        labels: One class id per proposal.
        conf_thres: Score threshold forwarded to OpenCV.
        iou_thres: IoU threshold forwarded to OpenCV.

    Returns:
        ``(boxes, scores, labels)`` of the kept proposals, with the boxes
        converted to ``x0, y0, x1, y1``. The input boxes are left untouched.
    """
    # Local import: this module only imports ``ndarray`` from numpy.
    import numpy as np

    if len(boxes) == 0:
        return [], [], []

    if MINOR >= 7:
        indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres,
                                          iou_thres)
    else:
        indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres)

    # Depending on the OpenCV version the result is a flat array, an (N, 1)
    # array, or an empty tuple when nothing survives; normalise to 1-D.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1)

    nmsd_boxes = []
    nmsd_scores = []
    nmsd_labels = []
    for idx in indices:
        idx = int(idx)
        # Work on a copy so a second call on the same proposals does not see
        # already-converted coordinates.
        box = np.array(boxes[idx])
        # x0y0wh -> x0y0x1y1
        box[2:] += box[:2]
        nmsd_boxes.append(box)
        nmsd_scores.append(scores[idx])
        nmsd_labels.append(labels[idx])
    return nmsd_boxes, nmsd_scores, nmsd_labels
def main():
    """Run a model-only ONNX detector over images and draw the detections.

    Each image is preprocessed, passed through ONNX Runtime on the CPU
    provider, decoded with the numpy decoder, filtered with NMS and annotated.
    The result is shown in a window when ``--show`` is given, otherwise it is
    written into ``--out-dir`` under the original file name.

    Raises:
        SystemExit: If ``--type`` is missing or names an unsupported model.
    """
    args = parse_args()
    out_dir = Path(args.out_dir)

    supported = ', '.join(member.value for member in ModelType)
    if args.type is None:
        raise SystemExit(f'--type is required, choose one of: {supported}')
    try:
        model_type = ModelType(args.type.lower())
    except ValueError:
        raise SystemExit(f'Unsupported --type {args.type!r}, '
                         f'choose one of: {supported}') from None

    # ``--img-size`` accepts one or more ints; a single value means a square.
    img_size = list(args.img_size)
    if len(img_size) == 1:
        img_size = img_size * 2

    if not args.show:
        out_dir.mkdir(parents=True, exist_ok=True)

    files = path_to_list(args.img)
    session = onnxruntime.InferenceSession(
        args.onnx, providers=['CPUExecutionProvider'])
    # Ask the model for its input name instead of assuming 'images'.
    input_name = session.get_inputs()[0].name
    preprocessor = Preprocess(model_type)
    decoder = Decoder(model_type, model_only=True)
    if model_type == ModelType.YOLOV5:
        anchors = YOLOv5_ANCHORS
    elif model_type == ModelType.YOLOV7:
        anchors = YOLOv7_ANCHORS
    else:
        anchors = None

    for file in tqdm(files):
        image = cv2.imread(file)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            tqdm.write(f'Skip unreadable image: {file}')
            continue
        image_h, image_w = image.shape[:2]
        img, (ratio_w, ratio_h) = preprocessor(image, img_size)
        features = session.run(None, {input_name: img})
        decoder_outputs = decoder(
            features,
            args.score_thr,
            num_labels=len(CLASS_NAMES),
            anchors=anchors)
        nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression(
            *decoder_outputs, args.score_thr, args.iou_thr)
        for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels):
            x0, y0, x1, y1 = box
            # Map back to the original resolution and keep the corners
            # inside the image.
            x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1))
            y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1))
            x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1))
            y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1))
            cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2)
            cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}',
                        (x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                        (0, 255, 255), 2)
        if args.show:
            cv2.imshow('result', image)
            cv2.waitKey(0)
        else:
            cv2.imwrite(f'{out_dir / Path(file).name}', image)
(ModelType.PPYOLOE, ModelType.PPYOLOEP): + self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV6: + self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV7: + self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.RTMDET: + self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV8: + self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs) + else: + raise NotImplementedError + return self.boxes_pro, self.scores_pro, self.labels_pro + + def __yolov5_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(10, 13), (16, 30), + (33, 23)], [(30, 61), (62, 45), + (59, 119)], [(116, 90), (156, 198), (373, 326)]]) + for i, feat in enumerate(feats): + stride = 8 << i + feat_h, feat_w, _ = feat.shape + anchor = anchors[i] + feat = sigmoid(feat) + feat = feat.reshape((feat_h, feat_w, len(anchor), -1)) + box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1) + + hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx, + aIdx] + boxes = box_feat[hIdx, wIdx, aIdx] + labels = score_feat.argmax(-1) + scores = score_feat.max(-1) + + indices = np.where(scores > conf_thres)[0] + if len(indices) == 0: + continue + + for idx in indices: + a_w, a_h = anchor[aIdx[idx]] + x, y, w, h = boxes[idx] + x = (x * 2.0 - 0.5 + wIdx[idx]) * stride + y = (y * 2.0 - 0.5 + hIdx[idx]) * stride + w = (w * 2.0)**2 * a_w + h = (h * 2.0)**2 * a_h + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(scores[idx])) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(labels[idx])) + + def __yolox_decode(self, + feats: 
List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat, conf_feat = np.split( + feat, [num_labels, num_labels + 4], -1) + conf_feat = sigmoid(conf_feat) + + hIdx, wIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx, + wIdx] + boxes = box_feat[hIdx, wIdx] + labels = score_feat.argmax(-1) + scores = score_feat.max(-1) + indices = np.where(scores > conf_thres)[0] + + if len(indices) == 0: + continue + + for idx in indices: + score = scores[idx] + label = labels[idx] + + x, y, w, h = boxes[idx] + + x = (x + wIdx[idx]) * stride + y = (y + hIdx[idx]) * stride + w = np.exp(w) * stride + h = np.exp(h) * stride + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __ppyoloe_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 17) + dfl = np.arange(0, reg_max, dtype=np.float32) + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max) + boxes = softmax(boxes, -1) @ dfl + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + 
+ self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov6_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov7_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(12, 16), (19, 36), + (40, 28)], [(36, 75), (76, 55), + (72, 146)], [(142, 110), (192, 243), (459, 401)]]) + self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors) + + def __rtmdet_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = 
class Preprocess:
    """Resize and normalise an image into a ``1 x 3 x H x W`` float32 tensor.

    The per-model mean, std and channel order are selected from
    ``model_type``. Models flagged ``is_rgb`` have their statistics in RGB
    order, so the input is converted from BGR before normalisation.
    """

    def __init__(self, model_type: ModelType):
        """Select normalisation constants for ``model_type``.

        Raises:
            NotImplementedError: If ``model_type`` has no preset.
        """
        if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7,
                          ModelType.YOLOV8, ModelType.PPYOLOEP):
            mean = np.array([0, 0, 0], dtype=np.float32)
            std = np.array([255, 255, 255], dtype=np.float32)
            is_rgb = True
        elif model_type == ModelType.YOLOX:
            mean = np.array([0, 0, 0], dtype=np.float32)
            std = np.array([1, 1, 1], dtype=np.float32)
            is_rgb = False
        elif model_type == ModelType.PPYOLOE:
            mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
            std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
            is_rgb = True
        elif model_type == ModelType.RTMDET:
            mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
            std = np.array([57.375, 57.12, 58.3955], dtype=np.float32)
            is_rgb = False
        else:
            raise NotImplementedError

        # Shaped (3, 1, 1) so they broadcast over a channel-first image.
        self.mean = mean.reshape((3, 1, 1))
        self.std = std.reshape((3, 1, 1))
        self.is_rgb = is_rgb

    def __call__(self,
                 image: ndarray,
                 new_size: Union[List[int], Tuple[int]] = (640, 640),
                 **kwargs) -> Tuple[ndarray, Tuple[float, float]]:
        """Preprocess one image.

        Args:
            image: ``H x W x 3`` image; assumed to be in OpenCV's BGR channel
                order (e.g. the result of ``cv2.imread``).
            new_size: Target ``(height, width)``.

        Returns:
            The normalised ``1 x 3 x H x W`` float32 tensor and the
            ``(ratio_w, ratio_h)`` scale factors that were applied.
        """
        # new_size: (height, width)
        height, width = image.shape[:2]
        ratio_h, ratio_w = new_size[0] / height, new_size[1] / width
        image = cv2.resize(
            image, (0, 0),
            fx=ratio_w,
            fy=ratio_h,
            interpolation=cv2.INTER_LINEAR)
        if self.is_rgb:
            # Without this swap the RGB-ordered statistics (and RGB-trained
            # models) would be applied to BGR pixels.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = np.ascontiguousarray(image.transpose(2, 0, 1))
        image = image.astype(np.float32)
        image -= self.mean
        image /= self.std
        return image[np.newaxis], (ratio_w, ratio_h)
class TRTWrapper(torch.nn.Module):
    """Run a serialized TensorRT engine as a ``torch.nn.Module``.

    The engine is deserialized from a ``.engine``/``.plan`` file. ``forward``
    hands the data pointers of the input tensors to TensorRT, runs the
    engine on a dedicated CUDA stream and returns the outputs as a tuple of
    torch tensors.

    Args:
        weight: Path to the serialized engine.
        device: Device for the outputs and the stream; a ``str`` is turned
            into a ``torch.device`` and an ``int`` into ``cuda:<int>``.

    Raises:
        ImportError: If ``tensorrt`` could not be imported.
    """

    # TensorRT dtype -> torch dtype. Filled when an instance is created
    # because ``trt`` may be unavailable at import time.
    dtype_mapping = {}

    def __init__(self, weight: Union[str, Path],
                 device: Optional[torch.device]):
        super().__init__()
        if trt is None:
            raise ImportError(
                'tensorrt is required by TRTWrapper but could not be imported')
        weight = Path(weight) if isinstance(weight, str) else weight
        assert weight.exists() and weight.suffix in ('.engine', '.plan')
        if isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device(f'cuda:{device}')
        self.weight = weight
        self.device = device
        self.stream = torch.cuda.Stream(device=device)
        self.__update_mapping()
        self.__init_engine()
        self.__init_bindings()

    def __update_mapping(self):
        """Register the TensorRT -> torch dtype translations."""
        self.dtype_mapping.update({
            trt.bool: torch.bool,
            trt.int8: torch.int8,
            trt.int32: torch.int32,
            trt.float16: torch.float16,
            trt.float32: torch.float32
        })

    def __init_engine(self):
        """Deserialize the engine and record its binding names and counts."""
        logger = trt.Logger(trt.Logger.ERROR)
        self.log = partial(logger.log, trt.Logger.ERROR)
        trt.init_libnvinfer_plugins(logger, namespace='')
        self.logger = logger
        with trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(self.weight.read_bytes())

        context = model.create_execution_context()

        names = [model.get_binding_name(i) for i in range(model.num_bindings)]

        num_inputs = sum(
            1 for i in range(model.num_bindings) if model.binding_is_input(i))
        num_outputs = model.num_bindings - num_inputs

        # The engine counts as dynamic when the first binding has a -1 dim.
        self.is_dynamic = -1 in model.get_binding_shape(0)

        self.model = model
        self.context = context
        # Inputs are taken to be the leading bindings; __init_bindings
        # asserts that the names line up.
        self.input_names = names[:num_inputs]
        self.output_names = names[num_inputs:]
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.num_bindings = num_inputs + num_outputs
        self.bindings: List[int] = [0] * self.num_bindings

    def __init_bindings(self):
        """Collect name/dtype/shape per binding and preallocate static outputs."""
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
        inputs_info = []
        outputs_info = []

        for i, name in enumerate(self.input_names):
            assert self.model.get_binding_name(i) == name
            dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
            shape = tuple(self.model.get_binding_shape(i))
            inputs_info.append(Binding(name, dtype, shape))

        for i, name in enumerate(self.output_names):
            i += self.num_inputs
            assert self.model.get_binding_name(i) == name
            dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
            shape = tuple(self.model.get_binding_shape(i))
            outputs_info.append(Binding(name, dtype, shape))
        self.inputs_info = inputs_info
        self.outputs_info = outputs_info
        if not self.is_dynamic:
            # Static shapes: allocate the output tensors once and reuse them
            # on every call.
            self.output_tensor = [
                torch.empty(o.shape, dtype=o.dtype, device=self.device)
                for o in outputs_info
            ]

    def forward(self, *inputs):
        """Execute the engine.

        Args:
            *inputs: One tensor per engine input, in binding order.

        Returns:
            Tuple with one tensor per engine output. For static engines these
            are the preallocated tensors, so they are overwritten by the next
            call.
        """
        assert len(inputs) == self.num_inputs

        contiguous_inputs: List[torch.Tensor] = [
            i.contiguous() for i in inputs
        ]

        for i in range(self.num_inputs):
            self.bindings[i] = contiguous_inputs[i].data_ptr()
            if self.is_dynamic:
                self.context.set_binding_shape(
                    i, tuple(contiguous_inputs[i].shape))

        # create output tensors
        outputs: List[torch.Tensor] = []

        for i in range(self.num_outputs):
            j = i + self.num_inputs
            if self.is_dynamic:
                # Output shapes are only known once the input shapes are set.
                shape = tuple(self.context.get_binding_shape(j))
                output = torch.empty(
                    size=shape,
                    dtype=self.outputs_info[i].dtype,
                    device=self.device)
            else:
                output = self.output_tensor[i]
            outputs.append(output)
            self.bindings[j] = output.data_ptr()

        self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)
        self.stream.synchronize()

        return tuple(outputs)
providers=providers) + self.session = session + + def __init_bindings(self): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape')) + inputs_info = [] + outputs_info = [] + self.is_dynamic = False + for i, tensor in enumerate(self.session.get_inputs()): + if any(not isinstance(i, int) for i in tensor.shape): + self.is_dynamic = True + inputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + + for i, tensor in enumerate(self.session.get_outputs()): + outputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + self.num_inputs = len(inputs_info) + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[np.ndarray] = [ + i.contiguous().cpu().numpy() for i in inputs + ] + + if not self.is_dynamic: + # make sure input shape is right for static input shape + for i in range(self.num_inputs): + assert contiguous_inputs[i].shape == self.inputs_info[i].shape + + outputs = self.session.run([o.name for o in self.outputs_info], { + j.name: contiguous_inputs[i] + for i, j in enumerate(self.inputs_info) + }) + + return tuple(torch.from_numpy(o).to(self.device) for o in outputs) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/model/model.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c67ed2872097e82d7f569a2f486b1a6463cde986 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/model/model.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from copy import deepcopy +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.models.layers import ChannelAttention +from mmengine.config import ConfigDict +from torch import Tensor + +from mmyolo.models import RepVGGBlock +from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head, + YOLOv7Head, YOLOv8Head, YOLOXHead) +from mmyolo.models.layers import ImplicitA, ImplicitM +from ..backbone import DeployFocus, GConvFocus, NcnnFocus +from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) +from ..nms import batched_nms, efficient_nms, onnx_nms +from .backend import MMYOLOBackend + + +class DeployModel(nn.Module): + transpose = False + + def __init__(self, + baseModel: nn.Module, + backend: MMYOLOBackend, + postprocess_cfg: Optional[ConfigDict] = None): + super().__init__() + self.baseModel = baseModel + self.baseHead = baseModel.bbox_head + self.backend = backend + if postprocess_cfg is None: + self.with_postprocess = False + else: + self.with_postprocess = True + self.__init_sub_attributes() + self.detector_type = type(self.baseHead) + self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000) + self.keep_top_k = postprocess_cfg.get('keep_top_k', 100) + self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65) + self.score_threshold = postprocess_cfg.get('score_threshold', 0.25) + self.__switch_deploy() + + def __init_sub_attributes(self): + self.bbox_decoder = self.baseHead.bbox_coder.decode + self.prior_generate = self.baseHead.prior_generator.grid_priors + self.num_base_priors = self.baseHead.num_base_priors + self.featmap_strides = self.baseHead.featmap_strides + self.num_classes = self.baseHead.num_classes + + def __switch_deploy(self): + headType = type(self.baseHead) + if not self.with_postprocess: + if headType in (YOLOv5Head, YOLOv7Head): + self.baseHead.head_module.forward_single 
= self.forward_single + elif headType in (PPYOLOEHead, YOLOv8Head): + self.baseHead.head_module.reg_max = 0 + + if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + self.transpose = True + for layer in self.baseModel.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + elif isinstance(layer, ChannelAttention): + layer.global_avgpool.forward = self.forward_gvp + elif isinstance(layer, Focus): + # onnxruntime openvino tensorrt8 tensorrt7 + if self.backend in (MMYOLOBackend.ONNXRUNTIME, + MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + self.baseModel.backbone.stem = DeployFocus(layer) + # ncnn + elif self.backend == MMYOLOBackend.NCNN: + self.baseModel.backbone.stem = NcnnFocus(layer) + # switch focus to group conv + else: + self.baseModel.backbone.stem = GConvFocus(layer) + + def pred_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + **kwargs): + assert len(cls_scores) == len(bbox_preds) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + + nms_func = self.select_nms() + if self.detector_type in (YOLOv5Head, YOLOv7Head): + bbox_decoder = yolov5_bbox_decoder + elif self.detector_type is RTMDetHead: + bbox_decoder = rtmdet_bbox_decoder + elif self.detector_type is YOLOXHead: + bbox_decoder = yolox_bbox_decoder + else: + bbox_decoder = self.bbox_decoder + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generate( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores 
= [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + return nms_func(bboxes, scores, self.keep_top_k, self.iou_threshold, + self.score_threshold, self.pre_top_k, self.keep_top_k) + + def select_nms(self): + if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO): + nms_func = onnx_nms + elif self.backend == MMYOLOBackend.TENSORRT8: + nms_func = efficient_nms + elif self.backend == MMYOLOBackend.TENSORRT7: + nms_func = batched_nms + else: + raise NotImplementedError + if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead): + nms_func = partial(nms_func, box_coding=1) + + return nms_func + + def forward(self, inputs: Tensor): + neck_outputs = self.baseModel(inputs) + if self.with_postprocess: + return self.pred_by_feat(*neck_outputs) + else: + outputs = [] + if self.transpose: + for feats in zip(*neck_outputs): + if self.backend in (MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + outputs.append( + torch.cat( + [feat.permute(0, 2, 3, 1) for feat in feats], + -1)) + else: + outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1)) + else: + for feats in zip(*neck_outputs): + outputs.append(torch.cat(feats, 1)) + return tuple(outputs) + + @staticmethod + def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]: + if isinstance(convs, nn.Sequential) and 
any( + type(m) in (ImplicitA, ImplicitM) for m in convs): + a, c, m = convs + aw = a.implicit.clone() + mw = m.implicit.clone() + c = deepcopy(c) + nw, cw, _, _ = c.weight.shape + na, ca, _, _ = aw.shape + nm, cm, _, _ = mw.shape + c.bias = nn.Parameter(c.bias + ( + c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1)) + c.bias = nn.Parameter(c.bias * mw.reshape(cm)) + c.weight = nn.Parameter(c.weight * mw.transpose(0, 1)) + convs = c + feat = convs(x) + return (feat, ) + + @staticmethod + def forward_gvp(x: Tensor) -> Tensor: + return torch.mean(x, [2, 3], keepdim=True) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/__init__.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59c5cdbd2b3b195125a14f473b825f616755fd6e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ort_nms import onnx_nms +from .trt_nms import batched_nms, efficient_nms + +__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..aad93cf05ac2ee9d61a85b4bf9e7b63c352859ec --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +def select_nms_index(scores: Tensor, + boxes: Tensor, + nms_index: Tensor, + batch_size: int, + keep_top_k: int = -1): + batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1] + box_inds = nms_index[:, 2] + + scores = scores[batch_inds, cls_inds, box_inds].unsqueeze(1) + boxes = boxes[batch_inds, box_inds, ...] + dets = torch.cat([boxes, scores], dim=1) + + batched_dets = dets.unsqueeze(0).repeat(batch_size, 1, 1) + batch_template = torch.arange( + 0, batch_size, dtype=batch_inds.dtype, device=batch_inds.device) + batched_dets = batched_dets.where( + (batch_inds == batch_template.unsqueeze(1)).unsqueeze(-1), + batched_dets.new_zeros(1)) + + batched_labels = cls_inds.unsqueeze(0).repeat(batch_size, 1) + batched_labels = batched_labels.where( + (batch_inds == batch_template.unsqueeze(1)), + batched_labels.new_ones(1) * -1) + + N = batched_dets.shape[0] + + batched_dets = torch.cat((batched_dets, batched_dets.new_zeros((N, 1, 5))), + 1) + batched_labels = torch.cat((batched_labels, -batched_labels.new_ones( + (N, 1))), 1) + + _, topk_inds = batched_dets[:, :, -1].sort(dim=1, descending=True) + topk_batch_inds = torch.arange( + batch_size, dtype=topk_inds.dtype, + device=topk_inds.device).view(-1, 1) + batched_dets = batched_dets[topk_batch_inds, topk_inds, ...] + batched_labels = batched_labels[topk_batch_inds, topk_inds, ...] 
+ batched_dets, batched_scores = batched_dets.split([4, 1], 2) + batched_scores = batched_scores.squeeze(-1) + + num_dets = (batched_scores > 0).sum(1, keepdim=True) + return num_dets, batched_dets, batched_scores, batched_labels + + +class ONNXNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]) + ) -> Tensor: + device = boxes.device + batch = scores.shape[0] + num_det = 20 + batches = torch.randint(0, batch, (num_det, )).sort()[0].to(device) + idxs = torch.arange(100, 100 + num_det).to(device) + zeros = torch.zeros((num_det, ), dtype=torch.int64).to(device) + selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], + 0).T.contiguous() + selected_indices = selected_indices.to(torch.int64) + + return selected_indices + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]), + ): + return g.op( + 'NonMaxSuppression', + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + outputs=1) + + +def onnx_nms( + boxes: torch.Tensor, + scores: torch.Tensor, + max_output_boxes_per_class: int = 100, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class]) + iou_threshold = torch.tensor([iou_threshold]) + score_threshold = torch.tensor([score_threshold]) + + batch_size, _, _ = scores.shape + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + scores = scores.transpose(1, 2).contiguous() + selected_indices = ONNXNMSop.apply(boxes, scores, + max_output_boxes_per_class, + iou_threshold, 
score_threshold) + + num_dets, batched_dets, batched_scores, batched_labels = select_nms_index( + scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k) + + return num_dets, batched_dets, batched_scores, batched_labels.to( + torch.int32) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..e0db1e2164d4366ff9ce4f74d39ded917c39ba79 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +class TRTEfficientNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25, + ): + batch_size, _, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25): + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + 
box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +class TRTbatchedNMSop(torch.autograd.Function): + """TensorRT NMS operation.""" + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + batch_size, _, numClasses = scores.shape + num_det = torch.randint( + 0, keepTopK, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, keepTopK, 4) + det_scores = torch.randn(batch_size, keepTopK) + det_classes = torch.randint(0, numClasses, + (batch_size, keepTopK)).float() + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + out = g.op( + 'TRT::BatchedNMSDynamic_TRT', + boxes, + scores, + shareLocation_i=shareLocation, + plugin_version_s=plugin_version, + backgroundLabelId_i=backgroundLabelId, + numClasses_i=numClasses, + topK_i=topK, + keepTopK_i=keepTopK, + scoreThreshold_f=scoreThreshold, + iouThreshold_f=iouThreshold, + isNormalized_i=isNormalized, + clipBoxes_i=clipBoxes, + scoreBits_i=scoreBits, + caffeSemantics_i=caffeSemantics, + outputs=4) + num_det, 
det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + return num_det, det_boxes, det_scores, det_classes + + +def _batched_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. 
+ max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, _, numClasses = scores.shape + + num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply( + boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096), + keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1) + + det_classes = det_classes.int() + return num_det, det_boxes, det_scores, det_classes + + +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +def batched_nms(*args, **kwargs): + """Wrapper function for `_batched_nms`.""" + return _batched_nms(*args, **kwargs) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/build_engine.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..b400c9db826878a7bb0fb13f4b1dea9b793583e7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/build_engine.py @@ -0,0 +1,136 @@ +import argparse +from pathlib import Path +from typing import List, Optional, Tuple, Union + +try: + import tensorrt as trt 
+except Exception: + trt = None +import warnings + +import numpy as np +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class EngineBuilder: + + def __init__( + self, + checkpoint: Union[str, Path], + opt_shape: Union[Tuple, List] = (1, 3, 640, 640), + device: Optional[Union[str, int, torch.device]] = None) -> None: + checkpoint = Path(checkpoint) if isinstance(checkpoint, + str) else checkpoint + assert checkpoint.exists() and checkpoint.suffix == '.onnx' + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + + self.checkpoint = checkpoint + self.opt_shape = np.array(opt_shape, dtype=np.float32) + self.device = device + + def __build_engine(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling: bool = True) -> None: + logger = trt.Logger(trt.Logger.WARNING) + trt.init_libnvinfer_plugins(logger, namespace='') + builder = trt.Builder(logger) + config = builder.create_builder_config() + config.max_workspace_size = torch.cuda.get_device_properties( + self.device).total_memory + flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + network = builder.create_network(flag) + parser = trt.OnnxParser(network, logger) + if not parser.parse_from_file(str(self.checkpoint)): + raise RuntimeError( + f'failed to load ONNX file: {str(self.checkpoint)}') + inputs = [network.get_input(i) for i in range(network.num_inputs)] + outputs = [network.get_output(i) for i in range(network.num_outputs)] + profile = None + dshape = -1 in network.get_input(0).shape + if dshape: + profile = builder.create_optimization_profile() + if scale is None: + scale = np.array( + [[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]], + dtype=np.float32) + scale = (self.opt_shape * scale).astype(np.int32) + elif isinstance(scale, List): + scale = np.array(scale, dtype=np.int32) + assert scale.shape[0] == 3, 'Input a wrong scale list' + else: + 
raise NotImplementedError + + for inp in inputs: + logger.log( + trt.Logger.WARNING, + f'input "{inp.name}" with shape{inp.shape} {inp.dtype}') + if dshape: + profile.set_shape(inp.name, *scale) + for out in outputs: + logger.log( + trt.Logger.WARNING, + f'output "{out.name}" with shape{out.shape} {out.dtype}') + if fp16 and builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + self.weight = self.checkpoint.with_suffix('.engine') + if dshape: + config.add_optimization_profile(profile) + if with_profiling: + config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED + with builder.build_engine(network, config) as engine: + self.weight.write_bytes(engine.serialize()) + logger.log( + trt.Logger.WARNING, f'Build tensorrt engine finish.\n' + f'Save in {str(self.weight.absolute())}') + + def build(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling=True): + self.__build_engine(scale, fp16, with_profiling) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--device', type=str, default='cuda:0', help='TensorRT builder device') + parser.add_argument( + '--scales', + type=str, + default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]', + help='Input scales for build dynamic input shape engine') + parser.add_argument( + '--fp16', action='store_true', help='Build model with fp16 mode') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def main(args): + img_size = (1, 3, *args.img_size) + try: + scales = eval(args.scales) + except Exception: + print('Input scales is not a python variable') + print('Set scales default None') + scales = None + builder = EngineBuilder(args.checkpoint, img_size, args.device) + builder.build(scales, fp16=args.fp16) + + +if __name__ 
== '__main__': + args = parse_args() + main(args) diff --git a/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b937cc8a72b5c09d61580ddb1297213693adaf1c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py @@ -0,0 +1,157 @@ +import argparse +import os +import sys +import warnings +from io import BytesIO +from pathlib import Path + +import onnx +import torch +from mmdet.apis import init_detector +from mmengine.config import ConfigDict +from mmengine.logging import print_log +from mmengine.utils.path import mkdir_or_exist + +# Add MMYOLO ROOT to sys.path +sys.path.append(str(Path(__file__).resolve().parents[3])) +from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 + +warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) +warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) +warnings.filterwarnings(action='ignore', category=UserWarning) +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=ResourceWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--model-only', action='store_true', help='Export model only') + parser.add_argument( + '--work-dir', default='./work_dir', help='Path to save export model') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument('--batch-size', type=int, default=1, help='Batch size') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--simplify', + action='store_true', + help='Simplify 
def build_model_from_cfg(config_path, checkpoint_path, device):
    """Build a detector from a config/checkpoint pair and put it in eval mode.

    Args:
        config_path: Path of the model config file, forwarded to
            ``init_detector``.
        checkpoint_path: Path of the checkpoint file, forwarded to
            ``init_detector``.
        device: Device the detector is created on.

    Returns:
        The detector returned by ``init_detector`` after ``eval()`` was
        called on it.
    """
    detector = init_detector(config_path, checkpoint_path, device=device)
    detector.eval()
    return detector


def onnx_save_path(work_dir, checkpoint):
    """Return the path of the exported ONNX file for ``checkpoint``.

    Only the file extension is swapped for ``.onnx``. A plain
    ``str.replace('pth', 'onnx')`` would also rewrite ``pth`` inside the
    file name itself (``depth.pth`` -> ``deonnx.onnx``) and would leave
    checkpoints with another extension (``.pt``) unchanged.

    Args:
        work_dir: Directory the ONNX file is written into.
        checkpoint: Path of the source checkpoint.

    Returns:
        ``<work_dir>/<checkpoint file name without extension>.onnx``.
    """
    stem, _ = os.path.splitext(os.path.basename(checkpoint))
    return os.path.join(work_dir, f'{stem}.onnx')


def main():
    """Export the configured detector to an ONNX file inside ``work_dir``.

    The model is wrapped in ``DeployModel``; the bbox decoder and NMS are
    only attached for backends that support them and when ``--model-only``
    is not set. The exported graph is validated with the ONNX checker,
    optionally simplified with onnx-sim, and then saved.
    """
    args = parse_args()
    mkdir_or_exist(args.work_dir)
    backend = MMYOLOBackend(args.backend.lower())
    if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO,
                   MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7):
        if not args.model_only:
            print_log('Export ONNX with bbox decoder and NMS ...')
    else:
        # Other backends cannot take the postprocess, export the bare model.
        args.model_only = True
        print_log(f'Can not export postprocess for {args.backend.lower()}.\n'
                  f'Set "args.model_only=True" default.')
    if args.model_only:
        postprocess_cfg = None
        output_names = None
    else:
        postprocess_cfg = ConfigDict(
            pre_top_k=args.pre_topk,
            keep_top_k=args.keep_topk,
            iou_threshold=args.iou_threshold,
            score_threshold=args.score_threshold)
        output_names = ['num_dets', 'boxes', 'scores', 'labels']
    baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device)

    deploy_model = DeployModel(
        baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg)
    deploy_model.eval()

    fake_input = torch.randn(args.batch_size, 3,
                             *args.img_size).to(args.device)
    # dry run
    deploy_model(fake_input)

    save_onnx_path = onnx_save_path(args.work_dir, args.checkpoint)
    # export onnx into memory first so it can be checked before saving
    with BytesIO() as f:
        torch.onnx.export(
            deploy_model,
            fake_input,
            f,
            input_names=['images'],
            output_names=output_names,
            opset_version=args.opset)
        f.seek(0)
        onnx_model = onnx.load(f)
        onnx.checker.check_model(onnx_model)

        # Fix tensorrt onnx output shape, just for view
        if not args.model_only and backend in (MMYOLOBackend.TENSORRT8,
                                               MMYOLOBackend.TENSORRT7):
            # One entry per output dimension, consumed in graph order:
            # num_dets (2 dims), boxes (3 dims), scores (2), labels (2).
            shapes = [
                args.batch_size, 1, args.batch_size, args.keep_topk, 4,
                args.batch_size, args.keep_topk, args.batch_size,
                args.keep_topk
            ]
            for i in onnx_model.graph.output:
                for j in i.type.tensor_type.shape.dim:
                    j.dim_param = str(shapes.pop(0))
    if args.simplify:
        # Simplification is best effort: on failure the unsimplified model
        # is still saved below.
        try:
            import onnxsim
            onnx_model, check = onnxsim.simplify(onnx_model)
            if not check:
                # explicit raise: a bare assert is stripped under ``-O``
                raise RuntimeError('assert check failed')
        except Exception as e:
            print_log(f'Simplify failure: {e}')
    onnx.save(onnx_model, save_onnx_path)
    print_log(f'ONNX export success, save into {save_onnx_path}')


if __name__ == '__main__':
    main()
def parse_args():
    """Parse the command line arguments of the image demo."""
    parser = ArgumentParser()
    parser.add_argument(
        'img', help='Image path, include image file, dir and URL.')
    parser.add_argument('config', help='Config file')
    parser.add_argument('checkpoint', help='Checkpoint file')
    parser.add_argument(
        '--out-dir', default='./output', help='Path to output file')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for inference')
    parser.add_argument(
        '--show', action='store_true', help='Show the detection results')
    return parser.parse_args()


def preprocess(config):
    """Build a module that normalizes a single image tensor.

    ``mean`` and ``std`` are read from ``config['model']['data_preprocessor']``
    and default to ``[0, 0, 0]`` / ``[1, 1, 1]`` when absent.

    Args:
        config: Mapping-like config object supporting ``get``.

    Returns:
        A ``torch.nn.Module`` in eval mode. Its ``forward`` adds a leading
        batch dimension, casts to float32 and returns ``(x - mean) / std``.
        The input tensor is never modified.
    """
    data_preprocess = config.get('model', {}).get('data_preprocessor', {})
    mean = data_preprocess.get('mean', [0., 0., 0.])
    std = data_preprocess.get('std', [1., 1., 1.])
    mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1)
    std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1)

    class PreProcess(torch.nn.Module):

        def forward(self, x):
            x = x[None].float()
            # Out-of-place on purpose: for a float32 input ``x[None].float()``
            # is a view of the caller's tensor, so ``-=`` / ``/=`` would
            # silently overwrite the caller's data.
            return (x - mean.to(x.device)) / std.to(x.device)

    return PreProcess().eval()


def main():
    """Run an exported ONNX/TensorRT detector on images and draw the boxes.

    Results are either shown in a window (``--show``) or written into
    ``--out-dir``.
    """
    args = parse_args()

    # register all modules in mmdet into the registries
    register_all_modules()

    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)]

    # build the model wrapper matching the checkpoint file type
    if args.checkpoint.endswith('.onnx'):
        model = ORTWrapper(args.checkpoint, args.device)
    elif args.checkpoint.endswith(('.engine', '.plan')):
        model = TRTWrapper(args.checkpoint, args.device)
    else:
        raise NotImplementedError(
            f'Unsupported checkpoint format: {args.checkpoint}')

    model.to(args.device)

    cfg = Config.fromfile(args.config)
    class_names = cfg.get('class_name')

    test_pipeline = get_test_pipeline_cfg(cfg)
    # images are decoded here, so the pipeline loads from an ndarray
    test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'})
    test_pipeline = Compose(test_pipeline)

    pre_pipeline = preprocess(cfg)

    if not args.show:
        path.mkdir_or_exist(args.out_dir)

    # get file list
    files, source_type = get_file_list(args.img)

    # start detector inference
    progress_bar = ProgressBar(len(files))
    for i, file in enumerate(files):
        bgr = mmcv.imread(file)
        rgb = mmcv.imconvert(bgr, 'bgr', 'rgb')
        data, samples = test_pipeline(dict(img=rgb, img_id=i)).values()
        pad_param = samples.get('pad_param',
                                np.array([0, 0, 0, 0], dtype=np.float32))
        h, w = samples.get('ori_shape', rgb.shape[:2])
        # NOTE(review): presumably pad_param is (top, bottom, left, right),
        # which makes this the (x, y, x, y) offset of the boxes - confirm
        # against the resize transform used by the pipeline.
        pad_param = torch.asarray(
            [pad_param[2], pad_param[0], pad_param[2], pad_param[0]],
            device=args.device)
        scale_factor = samples.get('scale_factor', [1., 1])
        # ``list(...) * 2`` repeats the pair to match the four box
        # coordinates for any sequence type; ``ndarray * 2`` would instead
        # double the values.
        scale_factor = torch.asarray(
            list(scale_factor) * 2, device=args.device)
        data = pre_pipeline(data).to(args.device)

        result = model(data)
        if source_type['is_dir']:
            # flatten the relative path into one file name; ``os.sep`` keeps
            # this working where the separator is not '/'
            filename = os.path.relpath(file, args.img).replace(os.sep, '_')
        else:
            filename = os.path.basename(file)
        out_file = None if args.show else os.path.join(args.out_dir, filename)

        # Get candidate predict info by num_dets
        num_dets, bboxes, scores, labels = result
        scores = scores[0, :num_dets]
        bboxes = bboxes[0, :num_dets]
        labels = labels[0, :num_dets]
        # map boxes from the network input back to the original image
        bboxes -= pad_param
        bboxes /= scale_factor

        bboxes[:, 0::2].clamp_(0, w)
        bboxes[:, 1::2].clamp_(0, h)
        bboxes = bboxes.round().int()

        for (bbox, score, label) in zip(bboxes, scores, labels):
            bbox = bbox.tolist()
            color = colors[label]

            if class_names is not None:
                label_name = class_names[label]
                name = f'cls:{label_name}_score:{score:0.4f}'
            else:
                name = f'cls:{label}_score:{score:0.4f}'

            cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2)
            cv2.putText(
                bgr,
                name, (bbox[0], bbox[1] - 2),
                cv2.FONT_HERSHEY_SIMPLEX,
                2.0, [225, 255, 255],
                thickness=3)

        if args.show:
            mmcv.imshow(bgr, 'result', 0)
        else:
            mmcv.imwrite(bgr, out_file)
        progress_bar.update()


if __name__ == '__main__':
    main()
+ +## Usage + + + +### Training commands + +In MMYOLO's root directory, run the following command to train the model: + +```bash +python tools/train.py projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py +``` + +### Testing commands + +In MMYOLO's root directory, run the following command to test the model: + +```bash +python tools/test.py projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py ${CHECKPOINT_PATH} +``` + +## Results + + + +| Method | Backbone | Pretrained Model | Training set | Test set | #epoch | box AP | Download | +| :---------------------------------------------------------------------------: | :-------------------: | :--------------: | :------------: | :----------: | :----: | :----: | :----------------------: | +| [YOLOv5 dummy](configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py) | DummyYOLOv5CSPDarknet | - | COCO2017 Train | COCO2017 Val | 300 | 37.7 | [model](<>) \| [log](<>) | + +## Citation + + + +```latex +@software{glenn_jocher_2022_7002879, + author = {Glenn Jocher and + Ayush Chaurasia and + Alex Stoken and + Jirka Borovec and + NanoCode012 and + Yonghye Kwon and + TaoXie and + Kalen Michael and + Jiacong Fang and + imyhxy and + Lorna and + Colin Wong and + 曾逸夫(Zeng Yifu) and + Abhiram V and + Diego Montes and + Zhiqiang Wang and + Cristi Fati and + Jebastin Nadar and + Laughing and + UnglvKitDe and + tkianai and + yxNONG and + Piotr Skalski and + Adam Hogan and + Max Strobel and + Mrinal Jain and + Lorenzo Mammana and + xylieong}, + title = {{ultralytics/yolov5: v6.2 - YOLOv5 Classification + Models, Apple M1, Reproducibility, ClearML and + Deci.ai integrations}}, + month = aug, + year = 2022, + publisher = {Zenodo}, + version = {v6.2}, + doi = {10.5281/zenodo.7002879}, + url = {https://doi.org/10.5281/zenodo.7002879} +} +``` + +## Checklist + + + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. 
+ + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Test-time correctness + + + + - [ ] A full README + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/models/YOLO-World/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py b/models/YOLO-World/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55b43bb3e97a20b4b9f98d5bc297bf8ef375da8e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py @@ -0,0 +1,5 @@ +_base_ = '../../../configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +custom_imports = dict(imports=['projects.example_project.dummy']) + +_base_.model.backbone.type = 'DummyYOLOv5CSPDarknet' diff --git a/models/YOLO-World/third_party/mmyolo/projects/example_project/dummy/__init__.py b/models/YOLO-World/third_party/mmyolo/projects/example_project/dummy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ca1028c8735be8ece5942d0ca64b69a8da16ed82 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/example_project/dummy/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
@MODELS.register_module()
class DummyYOLOv5CSPDarknet(YOLOv5CSPDarknet):
    """Demonstration backbone that behaves exactly like YOLOv5CSPDarknet.

    The only addition is a greeting printed before the parent class is
    initialized; it exists to show how a project registers its own module.

    Args:
        **kwargs: Forwarded unchanged to ``YOLOv5CSPDarknet.__init__``.
    """

    def __init__(self, **kwargs) -> None:
        # Greeting comes first so it is emitted even if the parent
        # constructor raises.
        print('Hello world!')
        super().__init__(**kwargs)
diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..1d6a9d3b0f5ecf9ff7a46202d50b733810d93124 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,76 @@ +_base_ = '../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' +# data_root = '/root/workspace/mmyolo/data/cat/' # Docker + +work_dir = './work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 4 + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, val_begin=20, val_interval=save_epoch_intervals) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + 
filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..67d5638aae7532efb60bd608f2a976d8991503b8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,85 @@ +_base_ = '../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' + +work_dir = './work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 # train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 8 + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - _base_.num_last_epochs, 1)]) + +model = dict( + 
bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..fff59cb3d31f002724b11674bb8c1550220be503 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,78 @@ +_base_ = 
'../yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' + +work_dir = './work_dirs/yolov7_tiny_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 # train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr = 0.01 / 4 + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - 10, 1)]) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + 
type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=2, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/README.md b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb7ddd580fb4e2872e54b9eade49a25b83211159 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/README.md @@ -0,0 +1,3 @@ +Tips: 这是 MMYOLO 应用范例的配置文件,请结合 [基于 MMYOLO 的频高图实时目标检测 benchmark](/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md) 来使用。 + +Tips: This is the config file of the MMYOLO application examples. Please use it in combination with [A Benchmark for Ionogram Detection Based on MMYOLO](/docs/en/recommended_topics/application_examples/ionogram_detection.md). diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..f1829eebf93e0dd8480819ef7710b94c2f3c24f5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,107 @@ +_base_ = 'mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = 
dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 +val_batch_size_per_gpu = train_batch_size_per_gpu + +# Config of batch shapes. Only on val. +batch_shapes_cfg = dict(batch_size=val_batch_size_per_gpu) + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 20 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=max_epochs // 5, + end=max_epochs, + T_max=max_epochs * 4 // 5, + by_epoch=True, + convert_to_iter_based=True), +] + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + 
data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +default_hooks = dict( + checkpoint=dict( + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..49b284b09a0c5605d59c2e332f9894aadaf3d483 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,83 @@ +_base_ = './rtmdet_l_fast_1xb32-100e_ionogram.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth' # noqa + +# ======================= Modified parameters ===================== +deepen_factor = 0.33 +widen_factor = 0.5 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + 
img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..acdaa0756c5df4e3aff3391651ab737c6632da44 --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py @@ -0,0 +1,62 @@ +_base_ = './rtmdet_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ====================== +deepen_factor = 0.167 +widen_factor = 0.375 +img_scale = _base_.img_scale + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 50 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=_base_.max_epochs // 2, + end=_base_.max_epochs, + T_max=_base_.max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=20, # note + random_pop=False, # note + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=(0.5, 2.0), + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + random_pop=False, + max_cached_images=10, + prob=0.5), + 
dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..737aeae9abeaee0e0024f04f4d7bfbeb9d8798a6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py @@ -0,0 +1,95 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# Copied from '../../yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +# Scale lr for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa + +# ===================== Unmodified in most cases ================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform 
+albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) +test_dataloader = dict(batch_size=train_batch_size_per_gpu) +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py new file mode 100644 index 0000000000000000000000000000000000000000..1252ebfca09eb21b1b96d4424c2329855e1b1a40 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py @@ -0,0 +1,35 @@ +_base_ = 
'./yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu / 2 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) + +test_dataloader = dict(batch_size=train_batch_size_per_gpu) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..dbe1305d835e8e0a435433deb36ff0d7ce9ec77d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' +class_name = 
('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) +# Batch size of a single GPU during training +train_batch_size_per_gpu = 96 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [[[8, 6], [24, 4], [19, 9]], [[22, 19], [17, 49], [29, 45]], + [[44, 66], [96, 76], [126, 59]]] + +# -----train val related----- +# base_lr_default * (your_bs / default_bs (8x16)) for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +max_epochs = 100 +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# default_hooks +save_epoch_intervals = 10 +logger_interval = 20 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + 
dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=None, # for yolov5 + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + max_epochs=max_epochs, val_begin=val_begin, val_interval=val_interval) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py new file mode 100644 index 0000000000000000000000000000000000000000..39ffb6ba1e110b0ee59136414939164d8e0fe1b5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='YOLOv5KeepRatioResize', scale=(640, 640)), + dict( + type='LetterResize', + scale=(640, 640), + allow_scale_up=False, + pad_val=dict(img=114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) 
+] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py new file mode 100644 index 0000000000000000000000000000000000000000..10c114cbcc1f754d46139157eece5d59666d6649 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py @@ -0,0 +1,29 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py new file mode 100644 index 0000000000000000000000000000000000000000..df8f6a2c561a67b275abca3cc5ca3763f1527d72 --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py @@ -0,0 +1,44 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=[ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) + ], + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap=dict(img='image', gt_bboxes='bboxes')), + dict(type='YOLOv5HSVRandomAug'), + # dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py new file mode 100644 index 0000000000000000000000000000000000000000..9f62fac92864c1de2d52d3382452a84a16dfe6f8 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = 
'./yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +base_lr = _base_.base_lr * 4 +max_epochs = 200 +load_from = None +logger_interval = 50 + +train_cfg = dict(max_epochs=max_epochs, ) + +# ===================== Unmodified in most cases ================== +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..dc5918d828ddd82ca349a307cb015b7fc29f68f1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,29 @@ +_base_ = './yolov6_m_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +deepen_factor = 1 +widen_factor = 1 + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. 
/ 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..00ea8ff055efd5b2094c723cb52118f51d3ce1c6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. 
/ 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..c9748b408d7a899d96c2852e1f5a9d726187957c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 
'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +tta_model = None +tta_pipeline = None + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa +# base_lr_default * (your_bs 32 / default_bs (8 x 32)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 32) +max_epochs = 100 +save_epoch_intervals = 10 +val_begin = 20 +max_keep_ckpts = 1 +log_interval = 50 +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=save_epoch_intervals, + dynamic_intervals=None) + +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +val_evaluator = dict(ann_file=data_root + val_ann_file) 
+test_evaluator = dict(ann_file=data_root + test_ann_file) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=log_interval)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py new file mode 100644 index 0000000000000000000000000000000000000000..cc38730f971664bb07edff2a8497e25d4376531f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ===================== +base_lr = _base_.base_lr * 4 +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +max_epochs = 200 +load_from = None + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, +) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=50)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py new file mode 100644 index 
0000000000000000000000000000000000000000..44d58c1f33a12b945c4fafb6f01b521a2e8c6e54 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py @@ -0,0 +1,98 @@ +_base_ = 'mmyolo::yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 16 +train_num_workers = 8 + +# -----model related----- +anchors = [[[14, 14], [35, 6], [32, 18]], [[32, 45], [28, 97], [52, 80]], + [[71, 122], [185, 94], [164, 134]]] + +# -----train val related----- +# base_lr_default * (your_bs 32 / default_bs (8 x 16)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=_base_.loss_cls_weight * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + 
+train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +optim_wrapper = dict( + optimizer=dict(lr=base_lr, batch_size_per_gpu=train_batch_size_per_gpu)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict( + interval=save_epoch_intervals, max_keep_ckpts=max_keep_ckpts)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..9c2d63ddeefaa50d3e180c558b1eec2e45180d46 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py @@ -0,0 +1,101 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +# pre-train +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +# -----model related----- +# Data augmentation +max_translate_ratio = 0.1 # YOLOv5RandomAffine +scaling_ratio_range = (0.5, 1.6) # YOLOv5RandomAffine +mixup_prob = 0.05 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.5 +loss_obj_weight = 1.0 + +lr_factor = 0.01 # Learning rate scaling factor + +# ====================== Unmodified in most cases ==================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +model = dict( + backbone=dict( + arch='Tiny', act_cfg=dict(type='LeakyReLU', negative_slope=0.1)), + neck=dict( + is_tiny_version=True, + in_channels=[128, 256, 512], + out_channels=[64, 128, 256], + block_cfg=dict( + _delete_=True, type='TinyDownSampleBlock', middle_ratio=0.25), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + use_repconv_outs=False), + bbox_head=dict( + head_module=dict(in_channels=[128, 256, 512]), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + 
type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, + beta=mixup_beta, + prob=mixup_prob, # change + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..606232a6619278e9583276ee89a9c4c340e3e8db --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py @@ -0,0 +1,19 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth' # noqa + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(arch='X'), + neck=dict( + in_channels=[640, 1280, 1280], + out_channels=[160, 320, 640], + block_cfg=dict( + 
type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2), + use_repconv_outs=False), + bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) diff --git a/models/YOLO-World/third_party/mmyolo/pytest.ini b/models/YOLO-World/third_party/mmyolo/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..9796e871e70c7c67345b1d6bcf708c0c82377a98 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +addopts = --xdoctest --xdoctest-style=auto +norecursedirs = .git ignore build __pycache__ data docker docs .eggs + +filterwarnings= default + ignore:.*No cfgstr given in Cacher constructor or call.*:Warning + ignore:.*Define the __nice__ method for.*:Warning diff --git a/models/YOLO-World/third_party/mmyolo/requirements.txt b/models/YOLO-World/third_party/mmyolo/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f50cbdc09d6389264f87e2aa1a576a81990e66a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements.txt @@ -0,0 +1,3 @@ +-r requirements/build.txt +-r requirements/runtime.txt +-r requirements/tests.txt diff --git a/models/YOLO-World/third_party/mmyolo/requirements/albu.txt b/models/YOLO-World/third_party/mmyolo/requirements/albu.txt new file mode 100644 index 0000000000000000000000000000000000000000..2957391ba9d71f694c74257b42e194529c11879f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/albu.txt @@ -0,0 +1 @@ +albumentations --no-binary qudida,albumentations diff --git a/models/YOLO-World/third_party/mmyolo/requirements/build.txt b/models/YOLO-World/third_party/mmyolo/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..c96c69aae6a2dfd7d8329707c7a7fe77e0b91f99 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/build.txt @@ -0,0 +1,3 @@ +# These must be installed before building mmyolo +cython +numpy diff --git 
a/models/YOLO-World/third_party/mmyolo/requirements/docs.txt b/models/YOLO-World/third_party/mmyolo/requirements/docs.txt new file mode 100644 index 0000000000000000000000000000000000000000..a93a3766c87ffad4c802c323f1a43578d7c8fd92 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/docs.txt @@ -0,0 +1,13 @@ +docutils==0.16.0 +mmcv>=2.0.0rc4,<=2.1.0 +mmdet>=3.0.0 +mmengine>=0.7.1 +myst-parser +-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +sphinx==4.0.2 +sphinx-copybutton +sphinx_markdown_tables +sphinx_rtd_theme==0.5.2 +torch +torchvision +urllib3<2.0.0 diff --git a/models/YOLO-World/third_party/mmyolo/requirements/mminstall.txt b/models/YOLO-World/third_party/mmyolo/requirements/mminstall.txt new file mode 100644 index 0000000000000000000000000000000000000000..843738f7caa0cd20a2c27c07381e960f0923624a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/mminstall.txt @@ -0,0 +1,3 @@ +mmcv>=2.0.0rc4,<=2.1.0 +mmdet>=3.0.0 +mmengine>=0.7.1 diff --git a/models/YOLO-World/third_party/mmyolo/requirements/mmpose.txt b/models/YOLO-World/third_party/mmyolo/requirements/mmpose.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e4726e68452ccd045940fa9df95681d9d44c2cf --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/mmpose.txt @@ -0,0 +1 @@ +mmpose>=1.0.0 diff --git a/models/YOLO-World/third_party/mmyolo/requirements/mmrotate.txt b/models/YOLO-World/third_party/mmyolo/requirements/mmrotate.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f05d38e76ce50f84535abcbe40109aadd1e1cb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/mmrotate.txt @@ -0,0 +1 @@ +mmrotate>=1.0.0rc1 diff --git a/models/YOLO-World/third_party/mmyolo/requirements/runtime.txt b/models/YOLO-World/third_party/mmyolo/requirements/runtime.txt new file mode 100644 index 
0000000000000000000000000000000000000000..794a9cab5748caf8059c4a610e7782bef321841f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/runtime.txt @@ -0,0 +1,2 @@ +numpy +prettytable diff --git a/models/YOLO-World/third_party/mmyolo/requirements/sahi.txt b/models/YOLO-World/third_party/mmyolo/requirements/sahi.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7b7b842fdc0ead64ce78615c99daa7420bddb9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/sahi.txt @@ -0,0 +1 @@ +sahi>=0.11.4 diff --git a/models/YOLO-World/third_party/mmyolo/requirements/tests.txt b/models/YOLO-World/third_party/mmyolo/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..285b3f3969a2137639e694b3b1652166bc43b177 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/requirements/tests.txt @@ -0,0 +1,17 @@ +flake8 +interrogate +isort==4.3.21 +# Note: used for kwarray.group_items, this may be ported to mmcv in the future. 
+kwarray +memory_profiler +mmcls>=1.0.0rc4 +mmpose>=1.0.0 +mmrazor>=1.0.0rc2 +mmrotate>=1.0.0rc1 +parameterized +protobuf<=3.20.1 +psutil +pytest +ubelt +xdoctest>=0.10.0 +yapf diff --git a/models/YOLO-World/third_party/mmyolo/setup.cfg b/models/YOLO-World/third_party/mmyolo/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d30673d0f6242fef3381b4171f9ec208b7f7bc3d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/setup.cfg @@ -0,0 +1,21 @@ +[isort] +line_length = 79 +multi_line_output = 0 +extra_standard_library = setuptools +known_first_party = mmyolo +known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,parameterized,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml,mmengine,mmdet,mmdeploy +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY + +[yapf] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true + +# ignore-words-list needs to be lowercase format. For example, if we want to +# ignore word "BA", then we need to append "ba" to ignore-words-list rather +# than "BA" +[codespell] +skip = *.ipynb +quiet-level = 3 +ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,tood,ba,warmup,elease,dota diff --git a/models/YOLO-World/third_party/mmyolo/setup.py b/models/YOLO-World/third_party/mmyolo/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..f37c89791fee95fb321d66a479f13420f64aa5b9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/setup.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +import platform +import shutil +import sys +import warnings +from setuptools import find_packages, setup + +from torch.utils.cpp_extension import BuildExtension + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'mmyolo/version.py' + + +def get_version(): + with open(version_file) as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. + + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import re + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield 
info + + def parse_require_file(fpath): + with open(fpath) as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + yield from parse_line(line) + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +def add_mim_extension(): + """Add extra files that are required to support MIM into the package. + + These files will be added by creating a symlink to the originals if the + package is installed in `editable` mode (e.g. pip install -e .), or by + copying from the originals otherwise. + """ + + # parse installment mode + if 'develop' in sys.argv: + # installed by `pip install -e .` + if platform.system() == 'Windows': + # set `copy` mode here since symlink fails on Windows. 
+ mode = 'copy' + else: + mode = 'symlink' + elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + # installed by `pip install .` + # or create source distribution by `python setup.py sdist` + mode = 'copy' + else: + return + + filenames = ['tools', 'configs', 'demo', 'model-index.yml'] + repo_path = osp.dirname(__file__) + mim_path = osp.join(repo_path, 'mmyolo', '.mim') + os.makedirs(mim_path, exist_ok=True) + + for filename in filenames: + if osp.exists(filename): + src_path = osp.join(repo_path, filename) + tar_path = osp.join(mim_path, filename) + + if osp.isfile(tar_path) or osp.islink(tar_path): + os.remove(tar_path) + elif osp.isdir(tar_path): + shutil.rmtree(tar_path) + + if mode == 'symlink': + src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) + os.symlink(src_relpath, tar_path) + elif mode == 'copy': + if osp.isfile(src_path): + shutil.copyfile(src_path, tar_path) + elif osp.isdir(src_path): + shutil.copytree(src_path, tar_path) + else: + warnings.warn(f'Cannot copy file {src_path}.') + else: + raise ValueError(f'Invalid mode {mode}') + + +if __name__ == '__main__': + add_mim_extension() + setup( + name='mmyolo', + version=get_version(), + description='OpenMMLab Toolbox of YOLO', + long_description=readme(), + long_description_content_type='text/markdown', + author='MMYOLO Contributors', + author_email='openmmlab@gmail.com', + keywords='computer vision, object detection', + url='https://github.com/open-mmlab/mmyolo', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + include_package_data=True, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + ], + license='GPL License 3.0', + install_requires=parse_requirements('requirements/runtime.txt'), + 
extras_require={ + 'all': parse_requirements('requirements.txt'), + 'tests': parse_requirements('requirements/tests.txt'), + 'build': parse_requirements('requirements/build.txt'), + 'mim': parse_requirements('requirements/mminstall.txt'), + }, + ext_modules=[], + cmdclass={'build_ext': BuildExtension}, + zip_safe=False) diff --git a/models/YOLO-World/third_party/mmyolo/tests/regression/mmyolo.yml b/models/YOLO-World/third_party/mmyolo/tests/regression/mmyolo.yml new file mode 100644 index 0000000000000000000000000000000000000000..55eaec38e1d7a7d3ef524928a1896c97f39633e4 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/regression/mmyolo.yml @@ -0,0 +1,81 @@ +globals: + codebase_dir: ../mmyolo + checkpoint_force_download: False + images: + input_img: &input_img ../mmyolo/demo/demo.jpg + test_img: &test_img ./tests/data/tiger.jpeg + metric_info: &metric_info + box AP: # named after metafile.Results.Metrics + metric_key: coco/bbox_mAP # eval OrderedDict key name + tolerance: 1 # metric ±n% + multi_value: 100 + convert_image: &convert_image + input_img: *input_img + test_img: *test_img + backend_test: &default_backend_test True + +onnxruntime: + pipeline_ort_static_fp32: &pipeline_ort_static_fp32 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_onnxruntime_static.py + + pipeline_ort_dynamic_fp32: &pipeline_ort_dynamic_fp32 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_onnxruntime_dynamic.py + +tensorrt: + pipeline_trt_static_fp32: &pipeline_trt_static_fp32_640x640 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_tensorrt_static-640x640.py + + pipeline_trt_static_fp16: &pipeline_trt_static_fp16_640x640 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_tensorrt-fp16_static-640x640.py + + pipeline_trt_dynamic_fp32: &pipeline_trt_dynamic_fp32 + convert_image: *convert_image 
+ backend_test: *default_backend_test + deploy_config: configs/mmyolo/detection_tensorrt_dynamic-192x192-960x960.py + + pipeline_trt_dynamic_fp16: &pipeline_trt_dynamic_fp16 + convert_image: *convert_image + backend_test: *default_backend_test + deploy_config: configs/mmyolo/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py + +models: + - name: YOLOv5 + metafile: configs/yolov5/metafile.yml + model_configs: + - configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + - name: YOLOv6 + metafile: configs/yolov6/metafile.yml + model_configs: + - configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + - name: YOLOX + metafile: configs/yolox/metafile.yml + model_configs: + - configs/yolox/yolox_s_8xb8-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + + - name: RTMDet + metafile: configs/rtmdet/metafile.yml + model_configs: + - configs/rtmdet/rtmdet_s_syncbn_8xb32-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_datasets/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_datasets/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp
import unittest

import numpy as np
from mmdet.structures import DetDataSample
from mmdet.structures.mask import BitmapMasks
from mmengine.structures import InstanceData, PixelData

from mmyolo.datasets.transforms import PackDetInputs


def _make_results(rng, img_path, with_ignore_flags=False,
                  with_panoptic_seg=False):
    """Build one fake pipeline ``results`` dict for ``PackDetInputs``.

    Random arrays are drawn from ``rng`` in dict-key order, so consecutive
    calls sharing one ``rng`` consume the random stream sequentially.

    Args:
        rng (np.random.RandomState): source of the random arrays.
        img_path (str): value stored under ``'img_path'``.
        with_ignore_flags (bool): add ``'gt_ignore_flags'`` marking the last
            of the three instances as ignored.
        with_panoptic_seg (bool): add a random ``'gt_panoptic_seg'`` map.
    """
    results = {
        'img_id': 1,
        'img_path': img_path,
        'ori_shape': (300, 400),
        'img_shape': (600, 800),
        'scale_factor': 2.0,
        'flip': False,
        'img': rng.rand(300, 400),
        'gt_seg_map': rng.rand(300, 400),
        'gt_masks': BitmapMasks(
            rng.rand(3, 300, 400), height=300, width=400),
    }
    if with_panoptic_seg:
        results['gt_panoptic_seg'] = rng.rand(1, 300, 400)
    results['gt_bboxes_labels'] = rng.rand(3, )
    if with_ignore_flags:
        results['gt_ignore_flags'] = np.array([0, 0, 1], dtype=bool)
    results['proposals'] = rng.rand(2, 4)
    results['proposals_scores'] = rng.rand(2, )
    return results


class TestPackDetInputs(unittest.TestCase):

    def setUp(self):
        """Set up the fake input dicts which are used in every test method.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> doCleanups()
        """
        data_prefix = osp.join(osp.dirname(__file__), '../../data')
        img_path = osp.join(data_prefix, 'color.jpg')
        rng = np.random.RandomState(0)
        # results1: with ignore flags; results2: plain; results3: panoptic.
        self.results1 = _make_results(rng, img_path, with_ignore_flags=True)
        self.results2 = _make_results(rng, img_path)
        self.results3 = _make_results(rng, img_path, with_panoptic_seg=True)
        self.meta_keys = ('img_id', 'img_path', 'ori_shape', 'scale_factor',
                          'flip')

    def _pack_and_check(self, raw_results, num_gt, num_ignored):
        """Pack a copy of ``raw_results`` and run the shared assertions.

        Returns the packed ``DetDataSample`` for test-specific checks.
        """
        transform = PackDetInputs(meta_keys=self.meta_keys)
        results = transform(copy.deepcopy(raw_results))
        self.assertIn('data_samples', results)
        data_sample = results['data_samples']
        self.assertIsInstance(data_sample, DetDataSample)
        self.assertIsInstance(data_sample.gt_instances, InstanceData)
        self.assertIsInstance(data_sample.ignored_instances, InstanceData)
        self.assertEqual(len(data_sample.gt_instances), num_gt)
        self.assertEqual(len(data_sample.ignored_instances), num_ignored)
        self.assertIsInstance(data_sample.gt_sem_seg, PixelData)
        return data_sample

    def test_transform(self):
        self._pack_and_check(self.results1, num_gt=2, num_ignored=1)

    def test_transform_without_ignore(self):
        self._pack_and_check(self.results2, num_gt=3, num_ignored=0)

    def test_transform_with_panoptic_seg(self):
        data_sample = self._pack_and_check(
            self.results3, num_gt=3, num_ignored=0)
        self.assertIsInstance(data_sample.gt_panoptic_seg, PixelData)

    def test_repr(self):
        transform = PackDetInputs(meta_keys=self.meta_keys)
        self.assertEqual(
            repr(transform), f'PackDetInputs(meta_keys={self.meta_keys})')
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import os.path as osp
import unittest

import numpy as np
import torch
from mmdet.structures.bbox import HorizontalBoxes
from mmdet.structures.mask import BitmapMasks, PolygonMasks

from mmyolo.datasets import YOLOv5CocoDataset
from mmyolo.datasets.transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp
from mmyolo.utils import register_all_modules

register_all_modules()

# Directory holding the sample images and annotation file used below.
_DATA_ROOT = osp.join(osp.dirname(__file__), '../../data')


def _build_pre_transform(with_mask=False):
    """Return the loading pipeline handed to the transforms under test."""
    load_ann = dict(type='LoadAnnotations', with_bbox=True)
    if with_mask:
        load_ann['with_mask'] = True
    return [dict(type='LoadImageFromFile'), load_ann]


def _build_dataset():
    """Return the sample COCO dataset stored under ``results['dataset']``."""
    return YOLOv5CocoDataset(
        data_prefix=dict(img=_DATA_ROOT),
        ann_file=osp.join(_DATA_ROOT, 'coco_sample_color.json'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=[])


def _build_results(dataset, img_shape, gt_masks=None):
    """Return a fake ``results`` dict with three boxes, the last ignored.

    Args:
        dataset: object stored under ``'dataset'``.
        img_shape (tuple): ``(h, w)`` of the random image.
        gt_masks: optional masks stored under ``'gt_masks'``.
    """
    results = {
        'img':
        np.random.random(tuple(img_shape) + (3, )),
        'img_shape':
        img_shape,
        'gt_bboxes_labels':
        np.array([1, 2, 3], dtype=np.int64),
        'gt_bboxes':
        np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]],
                 dtype=np.float32),
        'gt_ignore_flags':
        np.array([0, 0, 1], dtype=bool),
    }
    if gt_masks is not None:
        results['gt_masks'] = gt_masks
    results['dataset'] = dataset
    return results


def _clear_annotations(results):
    """Replace boxes, labels and ignore flags in ``results`` by empty ones."""
    results['gt_bboxes'] = np.empty((0, 4), dtype=np.float32)
    results['gt_bboxes_labels'] = np.empty((0, ), dtype=np.int64)
    results['gt_ignore_flags'] = np.empty((0, ), dtype=bool)


def _check_results(case,
                   results,
                   img_shape,
                   bbox_dtype=np.float32,
                   check_ignore_len=False):
    """Run the assertions shared by every transform test in this module.

    Args:
        case (unittest.TestCase): test case used to report failures.
        results (dict): output of the transform under test.
        img_shape (tuple): expected ``results['img'].shape[:2]``.
        bbox_dtype: expected dtype of ``results['gt_bboxes']``.
        check_ignore_len (bool): also require ``gt_ignore_flags`` to have as
            many entries as ``gt_bboxes``.
    """
    num_bboxes = results['gt_bboxes'].shape[0]
    case.assertEqual(results['img'].shape[:2], img_shape)
    case.assertEqual(results['gt_bboxes_labels'].shape[0], num_bboxes)
    if check_ignore_len:
        case.assertEqual(num_bboxes, results['gt_ignore_flags'].shape[0])
    case.assertEqual(results['gt_bboxes_labels'].dtype, np.int64)
    case.assertEqual(results['gt_bboxes'].dtype, bbox_dtype)
    case.assertEqual(results['gt_ignore_flags'].dtype, bool)


class TestMosaic(unittest.TestCase):

    def setUp(self):
        """Set up the data info which are used in every test method.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> doCleanups()
        """
        self.pre_transform = _build_pre_transform()
        self.dataset = _build_dataset()
        self.results = _build_results(self.dataset, (224, 224))

    def test_transform(self):
        # test assertion for invalid img_scale
        with self.assertRaises(AssertionError):
            Mosaic(img_scale=640)

        # test assertion for invalid probability
        with self.assertRaises(AssertionError):
            Mosaic(prob=1.5)

        # test assertion for invalid max_cached_images
        with self.assertRaises(AssertionError):
            Mosaic(use_cached=True, max_cached_images=1)

        transform = Mosaic(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = transform(copy.deepcopy(self.results))
        _check_results(self, results, (20, 24))

    def test_transform_with_no_gt(self):
        _clear_annotations(self.results)
        transform = Mosaic(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = transform(copy.deepcopy(self.results))
        self.assertIsInstance(results, dict)
        _check_results(self, results, (20, 24), check_ignore_len=True)

    def test_transform_with_box_list(self):
        transform = Mosaic(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = copy.deepcopy(self.results)
        results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes'])
        results = transform(results)
        _check_results(self, results, (20, 24), bbox_dtype=torch.float32)

    def test_transform_with_mask(self):
        rng = np.random.RandomState(0)
        pre_transform = _build_pre_transform(with_mask=True)
        dataset = _build_dataset()
        results = _build_results(
            dataset, (224, 224),
            gt_masks=PolygonMasks.random(
                num_masks=3, height=224, width=224, rng=rng))
        transform = Mosaic(img_scale=(12, 10), pre_transform=pre_transform)
        results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes'])
        results = transform(results)
        _check_results(self, results, (20, 24), bbox_dtype=torch.float32)


class TestMosaic9(unittest.TestCase):

    def setUp(self):
        """Set up the data info which are used in every test method.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> doCleanups()
        """
        rng = np.random.RandomState(0)
        self.pre_transform = _build_pre_transform()
        self.dataset = _build_dataset()
        self.results = _build_results(
            self.dataset, (224, 224),
            gt_masks=BitmapMasks(
                rng.rand(3, 224, 224), height=224, width=224))

    def test_transform(self):
        # test assertion for invalid img_scale
        with self.assertRaises(AssertionError):
            Mosaic9(img_scale=640)

        # test assertion for invalid probability
        with self.assertRaises(AssertionError):
            Mosaic9(prob=1.5)

        # test assertion for invalid max_cached_images
        with self.assertRaises(AssertionError):
            Mosaic9(use_cached=True, max_cached_images=1)

        transform = Mosaic9(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = transform(copy.deepcopy(self.results))
        _check_results(self, results, (20, 24))

    def test_transform_with_no_gt(self):
        _clear_annotations(self.results)
        transform = Mosaic9(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = transform(copy.deepcopy(self.results))
        self.assertIsInstance(results, dict)
        _check_results(self, results, (20, 24), check_ignore_len=True)

    def test_transform_with_box_list(self):
        transform = Mosaic9(
            img_scale=(12, 10), pre_transform=self.pre_transform)
        results = copy.deepcopy(self.results)
        results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes'])
        results = transform(results)
        _check_results(self, results, (20, 24), bbox_dtype=torch.float32)


class TestYOLOv5MixUp(unittest.TestCase):

    def setUp(self):
        """Set up the data info which are used in every test method.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> doCleanups()
        """
        self.pre_transform = _build_pre_transform()
        self.dataset = _build_dataset()
        self.results = _build_results(self.dataset, (288, 512))

    def test_transform(self):
        transform = YOLOv5MixUp(pre_transform=self.pre_transform)
        results = transform(copy.deepcopy(self.results))
        _check_results(self, results, (288, 512))

        # test assertion for invalid max_cached_images
        with self.assertRaises(AssertionError):
            YOLOv5MixUp(use_cached=True, max_cached_images=1)

    def test_transform_with_box_list(self):
        results = copy.deepcopy(self.results)
        results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes'])

        transform = YOLOv5MixUp(pre_transform=self.pre_transform)
        results = transform(results)
        _check_results(self, results, (288, 512), bbox_dtype=torch.float32)

    def test_transform_with_mask(self):
        rng = np.random.RandomState(0)
        pre_transform = _build_pre_transform(with_mask=True)
        dataset = _build_dataset()
        results = _build_results(
            dataset, (288, 512),
            gt_masks=PolygonMasks.random(
                num_masks=3, height=288, width=512, rng=rng))

        transform = YOLOv5MixUp(pre_transform=pre_transform)
        results = transform(copy.deepcopy(results))
        _check_results(self, results, (288, 512))


class TestYOLOXMixUp(unittest.TestCase):

    def setUp(self):
        """Set up the data info which are used in every test method.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> doCleanups()
        """
        rng = np.random.RandomState(0)
        self.pre_transform = _build_pre_transform()
        self.dataset = _build_dataset()
        self.results = _build_results(
            self.dataset, (224, 224),
            gt_masks=BitmapMasks(
                rng.rand(3, 224, 224), height=224, width=224))

    def _build_transform(self):
        """Return the ``YOLOXMixUp`` configuration shared by both tests."""
        return YOLOXMixUp(
            img_scale=(10, 12),
            ratio_range=(0.8, 1.6),
            pad_val=114.0,
            pre_transform=self.pre_transform)

    def test_transform(self):
        # test assertion for invalid img_scale
        with self.assertRaises(AssertionError):
            YOLOXMixUp(img_scale=640)

        # test assertion for invalid max_cached_images
        with self.assertRaises(AssertionError):
            YOLOXMixUp(use_cached=True, max_cached_images=1)

        transform = self._build_transform()
        results = transform(copy.deepcopy(self.results))
        _check_results(self, results, (224, 224))

    def test_transform_with_boxlist(self):
        results = copy.deepcopy(self.results)
        results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes'])

        transform = self._build_transform()
        results = transform(results)
        _check_results(self, results, (224, 224), bbox_dtype=torch.float32)
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + rng = np.random.RandomState(0) + self.data_info1 = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32), + batch_shape=np.array([192, 672], dtype=np.int64), + gt_masks=PolygonMasks.random(1, height=300, width=400, rng=rng)) + self.data_info2 = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32)) + self.data_info3 = dict( + img=np.random.random((300, 400, 3)), + batch_shape=np.array([192, 672], dtype=np.int64)) + self.data_info4 = dict(img=np.random.random((300, 400, 3))) + + def test_letter_resize(self): + # Test allow_scale_up + transform = LetterResize(scale=(640, 640), allow_scale_up=False) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue( + (results['gt_bboxes'] == np.array([[208., 0., 304., 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 208., + 208.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test pad_val + transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue( + (results['gt_bboxes'] == np.array([[208., 0., 304., 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 208., + 208.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test use_mini_pad + transform = LetterResize(scale=(640, 640), use_mini_pad=True) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 256, 3)) + 
self.assertTrue((results['gt_bboxes'] == np.array([[0., 0., 96., + 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 0., + 0.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test stretch_only + transform = LetterResize(scale=(640, 640), stretch_only=True) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue((results['gt_bboxes'] == np.array( + [[0., 0., 251.99998474121094, 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 0., + 0.])).all()) + + # Test + transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + for _ in range(5): + input_h, input_w = np.random.randint(100, 700), np.random.randint( + 100, 700) + output_h, output_w = np.random.randint(100, + 700), np.random.randint( + 100, 700) + data_info = dict( + img=np.random.random((input_h, input_w, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + batch_shape=np.array([output_h, output_w], dtype=np.int64), + gt_masks=PolygonMasks( + [[np.array([0., 0., 0., 10., 10., 10., 10., 0.])]], + height=input_h, + width=input_w)) + results = transform(data_info) + self.assertEqual(results['img_shape'], (output_h, output_w, 3)) + self.assertTrue( + (results['batch_shape'] == np.array([output_h, + output_w])).all()) + + # Test without batchshape + transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + for _ in range(5): + input_h, input_w = np.random.randint(100, 700), np.random.randint( + 100, 700) + data_info = dict( + img=np.random.random((input_h, input_w, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + gt_masks=PolygonMasks( + [[np.array([0., 0., 0., 10., 10., 10., 10., 0.])]], + height=input_h, + width=input_w)) + results = 
class TestYOLOv5KeepRatioResize(unittest.TestCase):
    """Tests for the ``YOLOv5KeepRatioResize`` transform."""

    def setUp(self):
        """Set up the data info which are used in every test method.

        ``data_info1`` holds a random 300x400x3 image, one ground-truth box
        and one random polygon mask; ``data_info2`` holds only an image.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> cleanUp()
        """
        rng = np.random.RandomState(0)
        self.data_info1 = dict(
            img=np.random.random((300, 400, 3)),
            gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32),
            gt_masks=PolygonMasks.random(
                num_masks=1, height=300, width=400, rng=rng))
        self.data_info2 = dict(img=np.random.random((300, 400, 3)))

    def test_yolov5_keep_ratio_resize(self):
        """Resize a 300x400 input to fit (640, 640) and check the outputs.

        Covers the ``keep_ratio=False`` assertion, a sample with boxes and
        masks, and an image-only sample.
        """
        # Build and configure the transform OUTSIDE the assertRaises block
        # so that only the transform call itself can satisfy the
        # expectation (a failing constructor must not make the test pass).
        transform = YOLOv5KeepRatioResize(scale=(640, 640))
        transform.keep_ratio = False
        with self.assertRaises(AssertionError):
            transform(copy.deepcopy(self.data_info1))

        # Test with gt_bboxes
        transform = YOLOv5KeepRatioResize(scale=(640, 640))
        results = transform(copy.deepcopy(self.data_info1))
        # The second positional argument of assertTrue is the failure
        # message, not an expected value, so only the flag is passed.
        self.assertTrue(transform.keep_ratio)
        self.assertEqual(results['img_shape'], (480, 640))
        self.assertTrue(
            (results['gt_bboxes'] == np.array([[0., 0., 240., 240.]])).all())
        self.assertTrue((np.array(results['scale_factor'],
                                  dtype=np.float32) == 1.6).all())

        # Test only img
        transform = YOLOv5KeepRatioResize(scale=(640, 640))
        results = transform(copy.deepcopy(self.data_info2))
        self.assertEqual(results['img_shape'], (480, 640))
        self.assertTrue((np.array(results['scale_factor'],
                                  dtype=np.float32) == 1.6).all())
class TestLoadAnnotations(unittest.TestCase):
    """Tests for the ``LoadAnnotations`` transform."""

    def setUp(self):
        """Assemble a raw annotation dict holding three instances.

        The third instance has ``ignore_flag=1``; the tests below expect
        only the first two instances to appear in the loaded output.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> cleanUp()
        """
        data_dir = osp.join(osp.dirname(__file__), '../../data')
        seg_map_file = osp.join(data_dir, 'gray.jpg')

        first = {
            'bbox': [0, 0, 10, 20],
            'bbox_label': 1,
            'mask': [[0, 0, 0, 20, 10, 20, 10, 0]],
            'ignore_flag': 0
        }
        second = {
            'bbox': [10, 10, 110, 120],
            'bbox_label': 2,
            'mask': [[10, 10, 110, 10, 110, 120, 110, 10]],
            'ignore_flag': 0
        }
        ignored = {
            'bbox': [50, 50, 60, 80],
            'bbox_label': 2,
            'mask': [[50, 50, 60, 50, 60, 80, 50, 80]],
            'ignore_flag': 1
        }
        self.results = {
            'ori_shape': (300, 400),
            'seg_map_path': seg_map_file,
            'instances': [first, second, ignored]
        }

    def test_load_bboxes(self):
        """Load boxes only and check values, dtypes and the empty case."""
        loader = LoadAnnotations(
            with_bbox=True,
            with_label=False,
            with_seg=False,
            with_mask=False,
            box_type=None)
        loaded = loader(copy.deepcopy(self.results))

        expected_boxes = np.array([[0, 0, 10, 20], [10, 10, 110, 120]])
        expected_flags = np.array([False, False])
        self.assertIn('gt_bboxes', loaded)
        self.assertTrue((loaded['gt_bboxes'] == expected_boxes).all())
        self.assertEqual(loaded['gt_bboxes'].dtype, np.float32)
        self.assertTrue((loaded['gt_ignore_flags'] == expected_flags).all())
        self.assertEqual(loaded['gt_ignore_flags'].dtype, bool)

        # An input without any instances yields empty, correctly shaped
        # arrays.
        empty = loader({})
        self.assertIn('gt_bboxes', empty)
        self.assertTrue(empty['gt_bboxes'].shape == (0, 4))
        self.assertIn('gt_ignore_flags', empty)
        self.assertTrue(empty['gt_ignore_flags'].shape == (0, ))

    def test_load_labels(self):
        """Load labels only and check values, dtype and the empty case."""
        loader = LoadAnnotations(
            with_bbox=False,
            with_label=True,
            with_seg=False,
            with_mask=False,
        )
        loaded = loader(copy.deepcopy(self.results))

        expected_labels = np.array([1, 2])
        self.assertIn('gt_bboxes_labels', loaded)
        self.assertTrue((loaded['gt_bboxes_labels'] == expected_labels).all())
        self.assertEqual(loaded['gt_bboxes_labels'].dtype, np.int64)

        # An input without any instances yields an empty label array.
        empty = loader({})
        self.assertIn('gt_bboxes_labels', empty)
        self.assertTrue(empty['gt_bboxes_labels'].shape == (0, ))
class TestPPYOLOERandomCrop(unittest.TestCase):
    """Tests for the ``PPYOLOERandomCrop`` transform."""

    def setUp(self):
        """Setup the data info which are used in every test method.

        The sample is a random 224x224x3 image with three labelled boxes,
        the last of which is flagged as ignored.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> cleanUp()
        """
        boxes = np.array(
            [[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]],
            dtype=np.float32)
        self.results = dict(
            img=np.random.random((224, 224, 3)),
            img_shape=(224, 224),
            gt_bboxes_labels=np.array([1, 2, 3], dtype=np.int64),
            gt_bboxes=boxes,
            gt_ignore_flags=np.array([0, 0, 1], dtype=bool),
        )

    def _check_outputs(self, cropped, bbox_dtype):
        """Assert label/box count agreement and the expected dtypes."""
        num_labels = cropped['gt_bboxes_labels'].shape[0]
        num_boxes = cropped['gt_bboxes'].shape[0]
        self.assertTrue(num_labels == num_boxes)
        self.assertTrue(cropped['gt_bboxes_labels'].dtype == np.int64)
        self.assertTrue(cropped['gt_bboxes'].dtype == bbox_dtype)
        self.assertTrue(cropped['gt_ignore_flags'].dtype == bool)

    def test_transform(self):
        """Crop a sample whose boxes are a plain numpy array."""
        crop = PPYOLOERandomCrop()
        cropped = crop(copy.deepcopy(self.results))
        self._check_outputs(cropped, np.float32)

    def test_transform_with_boxlist(self):
        """Crop a sample whose boxes are wrapped in ``HorizontalBoxes``."""
        sample = copy.deepcopy(self.results)
        sample['gt_bboxes'] = HorizontalBoxes(sample['gt_bboxes'])

        crop = PPYOLOERandomCrop()
        cropped = crop(copy.deepcopy(sample))
        # The box dtype is compared against a torch dtype in this variant.
        self._check_outputs(cropped, torch.float32)
class TestYOLOv5CopyPaste(unittest.TestCase):
    """Tests for the ``YOLOv5CopyPaste`` transform."""

    def setUp(self):
        """Set up the data info which are used in every test method.

        The sample is a random 300x400x3 image with a single box and the
        matching square polygon mask.

        TestCase calls functions in this order: setUp() -> testMethod() ->
        tearDown() -> cleanUp()
        """
        square = np.array([0., 0., 0., 10., 10., 10., 10., 0.])
        self.data_info = {
            'img': np.random.random((300, 400, 3)),
            'gt_bboxes': np.array([[0, 0, 10, 10]], dtype=np.float32),
            'gt_masks': PolygonMasks([[square]], height=300, width=400),
        }

    def test_transform(self):
        """Apply copy-paste with ``prob=1.0`` and check the bitmap case.

        With a polygon mask the single instance becomes two; feeding
        ``BitmapMasks`` instead is expected to raise ``AssertionError``.
        """
        copy_paste = YOLOv5CopyPaste(prob=1.0)
        pasted = copy_paste(copy.deepcopy(self.data_info))
        self.assertEqual(len(pasted['gt_bboxes']), 2)
        self.assertEqual(len(pasted['gt_masks']), 2)

        bitmap_rng = np.random.RandomState(0)
        # test with bitmap
        with self.assertRaises(AssertionError):
            pasted = copy_paste({
                'img':
                np.random.random((300, 400, 3)),
                'gt_bboxes':
                np.array([[0, 0, 10, 10]], dtype=np.float32),
                'gt_masks':
                BitmapMasks(
                    bitmap_rng.rand(1, 300, 400), height=300, width=400),
            })
class TestYOLOv5Collate(unittest.TestCase):
    """Tests for ``yolov5_collate`` with and without multi-scale mode."""

    @staticmethod
    def _make_sample():
        """Return one collate item with a 3x10x10 input and four boxes.

        The item is ``dict(inputs=..., data_samples=...)`` where the data
        sample carries four random ``HorizontalBoxes`` and their labels.
        """
        rng = np.random.RandomState(0)

        inputs = torch.randn((3, 10, 10))
        data_samples = DetDataSample()
        gt_instances = InstanceData()
        bboxes = _rand_bboxes(rng, 4, 6, 8)
        gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32)
        labels = rng.randint(1, 2, size=len(bboxes))
        gt_instances.labels = torch.LongTensor(labels)
        data_samples.gt_instances = gt_instances
        return dict(inputs=inputs, data_samples=data_samples)

    def test_yolov5_collate(self):
        """Collate batches of one and two items into stacked inputs."""
        sample = self._make_sample()

        for batch_size in (1, 2):
            with self.subTest(batch_size=batch_size):
                out = yolov5_collate([sample] * batch_size)
                self.assertIsInstance(out, dict)
                self.assertEqual(out['inputs'].shape,
                                 (batch_size, 3, 10, 10))
                # assertTrue(x, dict) would treat ``dict`` as the failure
                # message and never check the type.
                self.assertIsInstance(out['data_samples'], dict)
                # Four boxes per item, six columns per row.
                self.assertEqual(out['data_samples']['bboxes_labels'].shape,
                                 (4 * batch_size, 6))

    def test_yolov5_collate_with_multi_scale(self):
        """With ``use_ms_training=True`` the inputs stay a list."""
        sample = self._make_sample()

        for batch_size in (1, 2):
            with self.subTest(batch_size=batch_size):
                out = yolov5_collate([sample] * batch_size,
                                     use_ms_training=True)
                self.assertIsInstance(out, dict)
                self.assertIsInstance(out['inputs'], list)
                self.assertEqual(out['inputs'][0].shape, (3, 10, 10))
                self.assertIsInstance(out['data_samples'], dict)
                self.assertEqual(out['data_samples']['bboxes_labels'].shape,
                                 (4 * batch_size, 6))
                self.assertIsInstance(out['data_samples']['bboxes_labels'],
                                      torch.Tensor)
class TestYOLOv5CocoDataset(unittest.TestCase):
    """Tests for ``YOLOv5CocoDataset``."""

    def test_batch_shapes_cfg(self):
        """Check sample order and batch shapes under ``BatchShapePolicy``.

        The same expectations are verified with ``serialize_data=True``
        and with ``serialize_data=False``.
        """
        batch_shapes_cfg = dict(
            type='BatchShapePolicy',
            batch_size=2,
            img_size=640,
            size_divisor=32,
            extra_pad_ratio=0.5)

        expected_img_ids = [3, 0, 2, 1]
        expected_batch_shapes = [[512, 672], [512, 672], [672, 672],
                                 [672, 672]]

        # Both serialization modes must produce the same order and shapes.
        for serialize_data in (True, False):
            with self.subTest(serialize_data=serialize_data):
                dataset = YOLOv5CocoDataset(
                    data_prefix=dict(img='imgs'),
                    ann_file='tests/data/coco_sample.json',
                    filter_cfg=dict(filter_empty_gt=False, min_size=0),
                    pipeline=[],
                    serialize_data=serialize_data,
                    batch_shapes_cfg=batch_shapes_cfg,
                )

                for i, data in enumerate(dataset):
                    self.assertEqual(data['img_id'], expected_img_ids[i])
                    self.assertEqual(data['batch_shape'].tolist(),
                                     expected_batch_shapes[i])

    def test_prepare_data(self):
        """Check when the ``'dataset'`` key is attached to each sample.

        The key is expected in the default mode and must be absent when
        the dataset is built with ``test_mode=True``.
        """
        dataset = YOLOv5CocoDataset(
            data_prefix=dict(img='imgs'),
            ann_file='tests/data/coco_sample.json',
            filter_cfg=dict(filter_empty_gt=False, min_size=0),
            pipeline=[],
            serialize_data=True,
            batch_shapes_cfg=None,
        )
        for data in dataset:
            self.assertIn('dataset', data)

        # test with test_mode = True
        dataset = YOLOv5CocoDataset(
            data_prefix=dict(img='imgs'),
            ann_file='tests/data/coco_sample.json',
            test_mode=True,
            pipeline=[])

        for data in dataset:
            self.assertNotIn('dataset', data)
def test_concat_dataset(self):
    """Concatenate the VOC2007 and VOC2012 sample sets.

    Both member datasets are passed as config dicts; after ``full_init``
    the concatenated dataset is expected to hold two samples.
    """

    def voc_cfg(ann_file, sub_data_root):
        # Fresh nested dicts/lists on every call so the two configs do
        # not share mutable state.
        return dict(
            type='YOLOv5VOCDataset',
            data_root='tests/data/VOCdevkit/',
            ann_file=ann_file,
            data_prefix=dict(sub_data_root=sub_data_root),
            filter_cfg=dict(filter_empty_gt=False, min_size=32),
            pipeline=[])

    member_cfgs = [
        voc_cfg('VOC2007/ImageSets/Main/trainval.txt', 'VOC2007/'),
        voc_cfg('VOC2012/ImageSets/Main/trainval.txt', 'VOC2012/'),
    ]
    concat = ConcatDataset(datasets=member_cfgs, ignore_keys='dataset_type')

    concat.full_init()
    self.assertEqual(len(concat), 2)
def seed_everything(seed=1029):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` with the same value. When CUDA is
    available the CUDA generators are seeded as well and cuDNN is switched
    to a deterministic, non-benchmarking, disabled configuration.

    Args:
        seed (int): Seed applied to every generator. Defaults to 1029.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True
    cudnn.enabled = False
of pytorch model + model_inputs = { + 'cls_scores': cls_scores, + 'bbox_preds': bbox_preds, + 'objectnesses': objectnesses, + 'batch_img_metas': batch_img_metas, + 'with_nms': True + } + model_outputs = get_model_outputs(yolov5_head, 'predict_by_feat', + model_inputs) + + # to get outputs of onnx model after rewrite + wrapped_model = WrapModel( + yolov5_head, + 'predict_by_feat', + batch_img_metas=batch_img_metas, + with_nms=True) + rewrite_inputs = { + 'cls_scores': cls_scores, + 'bbox_preds': bbox_preds, + 'objectnesses': objectnesses, + } + rewrite_outputs, is_backend_output = get_rewrite_outputs( + wrapped_model=wrapped_model, + model_inputs=rewrite_inputs, + deploy_cfg=deploy_cfg) + + if is_backend_output: + # hard code to make two tensors with the same shape + # rewrite and original codes applied different nms strategy + min_shape = min(model_outputs[0].bboxes.shape[0], + rewrite_outputs[0].shape[1], 5) + for i in range(len(model_outputs)): + rewrite_outputs[0][i, :min_shape, 0::2] = \ + rewrite_outputs[0][i, :min_shape, 0::2].clamp_(0, s) + rewrite_outputs[0][i, :min_shape, 1::2] = \ + rewrite_outputs[0][i, :min_shape, 1::2].clamp_(0, s) + assert np.allclose( + model_outputs[i].bboxes[:min_shape], + rewrite_outputs[0][i, :min_shape, :4], + rtol=1e-03, + atol=1e-05) + assert np.allclose( + model_outputs[i].scores[:min_shape], + rewrite_outputs[0][i, :min_shape, 4], + rtol=1e-03, + atol=1e-05) + assert np.allclose( + model_outputs[i].labels[:min_shape], + rewrite_outputs[1][i, :min_shape], + rtol=1e-03, + atol=1e-05) + else: + assert rewrite_outputs is not None diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_deploy/test_object_detection.py b/models/YOLO-World/third_party/mmyolo/tests/test_deploy/test_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..b701e2557699de14d5e42679740e67706fa3bf6d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_deploy/test_object_detection.py @@ -0,0 +1,96 @@ +# 
@pytest.fixture(autouse=True)
def init_task_processor():
    """Rebuild the module-level ``task_processor`` before every test.

    The processor is built from the module's ``model_cfg`` and
    ``deploy_cfg`` for the ``'cpu'`` device and stored in the global so
    that the other fixtures and tests of this module can use it.
    """
    global task_processor
    device = 'cpu'
    task_processor = build_task_processor(model_cfg, deploy_cfg, device)
+
+
+@pytest.fixture
+def backend_model():  # yields the model built by the module task_processor
+    from mmdeploy.backend.onnxruntime import ORTWrapper
+    ort_apis.__dict__.update({'ORTWrapper': ORTWrapper})
+    wrapper = SwitchBackendWrapper(ORTWrapper)
+    wrapper.set(
+        outputs={
+            'dets': torch.rand(1, 10, 5).sort(2).values,
+            'labels': torch.randint(0, 10, (1, 10))
+        })
+    # Everything above is setup; pytest runs the code after the yield as teardown.
+    yield task_processor.build_backend_model([''])
+    # presumably undoes the ORTWrapper switch made above -- TODO confirm
+    wrapper.recover()
+
+
+def test_visualize(backend_model):
+    """Visualising a backend result must write the image file to disk."""
+    input_dict, _ = task_processor.create_input(
+        'tests/data/color.jpg', input_shape=img_shape)
+    results = backend_model.test_step(input_dict)[0]
+    with TemporaryDirectory() as tmp_dir:
+        filename = os.path.join(tmp_dir, 'tmp.jpg')  # stay inside tmp_dir
+        task_processor.visualize(img, results, filename, 'window')
+        assert os.path.exists(filename)
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_downstream/test_mmrazor.py b/models/YOLO-World/third_party/mmyolo/tests/test_downstream/test_mmrazor.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3090d263853e871fb70950be0acd845e19a238
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_downstream/test_mmrazor.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import pytest
+from mmcls.models.backbones.base_backbone import BaseBackbone
+
+from mmyolo.testing import get_detector_cfg
+
+
+@pytest.mark.parametrize('cfg_file', [
+    'razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py',
+    'razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py',
+    'razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py'
+])
+def test_razor_backbone_init(cfg_file):
+    """Every razor subnet config must build a ``BaseBackbone`` instance."""
+    detector_cfg = get_detector_cfg(cfg_file)
+    backbone_cfg = copy.deepcopy(detector_cfg.backbone)
+    from mmrazor.registry import MODELS
+    backbone = MODELS.build(backbone_cfg)
+    assert isinstance(backbone, BaseBackbone)
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..52d6e9f0583923feff08cf1cc6f41c8223503d88
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+from mmyolo.engine.hooks import SwitchToDeployHook
+from mmyolo.models import RepVGGBlock
+from mmyolo.utils import register_all_modules
+
+register_all_modules()
+
+
+class TestSwitchToDeployHook(TestCase):
+
+    def test(self):
+        """The hook must switch the runner's model into deploy mode."""
+        runner = Mock(model=RepVGGBlock(256, 256))
+        deploy_hook = SwitchToDeployHook()
+
+        # A freshly constructed block must not be in deploy mode yet.
+        self.assertFalse(runner.model.deploy)
+
+        # Running the pre-test-epoch callback must flip the flag.
+        deploy_hook.before_test_epoch(runner)
+        self.assertTrue(runner.model.deploy)
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a527333023a179d95b8cd41b82fa5fd9842c0c6
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py
@@ -0,0 +1,124 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+import torch
+from mmengine.config import Config
+from mmengine.optim import build_optim_wrapper
+from mmengine.runner import Runner
+from torch import nn
+from torch.utils.data import Dataset
+
+from mmyolo.engine.hooks import YOLOv5ParamSchedulerHook
+from mmyolo.utils import register_all_modules
+
+
+class ToyModel(nn.Module):  # single linear layer, just enough to own params
+
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(2, 1)
+
+    def forward(self, inputs, data_samples, mode='tensor'):
+        labels = torch.stack(data_samples)
+        inputs = torch.stack(inputs)
+        outputs = self.linear(inputs)
+        if mode == 'tensor':
+            return outputs
+        elif mode == 'loss':
+            loss = (labels - outputs).sum()
+            outputs = dict(loss=loss)  # loss mode returns a dict, not a tensor
+            return outputs
+        else:  # any other mode returns the raw outputs, same as 'tensor'
+            return outputs
+
+
+class DummyDataset(Dataset):  # 12 random samples shared at class level
+    METAINFO = dict()  # type: ignore
+    data = torch.randn(12, 2)
+    label = torch.ones(12)
+
+    @property
+    def metainfo(self):
+        return self.METAINFO
+
+    def __len__(self):
+        return self.data.size(0)
+
+    def __getitem__(self, index):
+        return dict(inputs=self.data[index], data_sample=self.label[index])
+
+
+optim_wrapper = dict(  # optimizer config consumed by build_optim_wrapper
+    type='OptimWrapper',
+    optimizer=dict(
+        type='SGD',
+        lr=0.01,
+        momentum=0.937,
+        weight_decay=0.0005,
+        nesterov=True,
+        batch_size_per_gpu=1),
+    constructor='YOLOv5OptimizerConstructor')
+
+register_all_modules()
+
+
+class TestYOLOv5ParamSchelerHook(TestCase):  # NOTE(review): "Scheler" typo
+
+    def test(self):
+        model = ToyModel()
+        train_dataloader = dict(
+            dataset=DummyDataset(),
+            sampler=dict(type='DefaultSampler', shuffle=True),
+            batch_size=3,
+            num_workers=0)
+
+        runner = Mock()  # stand-in runner; only the attributes set below are real
+        runner.model = model
+        runner.optim_wrapper = build_optim_wrapper(model, optim_wrapper)
+        runner.cfg.train_dataloader = Config(train_dataloader)
+        runner.train_dataloader = Runner.build_dataloader(train_dataloader)
+
+        hook = YOLOv5ParamSchedulerHook(
+            scheduler_type='linear', lr_factor=0.01, max_epochs=300)
+
+        # before_train must leave lr and momentum at their configured values
+        runner.epoch = 0
+        runner.iter = 0
+        hook.before_train(runner)
+
+        for group in runner.optim_wrapper.param_groups:
+            self.assertEqual(group['lr'], 0.01)
+            self.assertEqual(group['momentum'], 0.937)
+
+        self.assertFalse(hook._warmup_end)
+
+        # after 10 iterations the hook must still be warming up
+        for i in range(10):
+            runner.iter += 1
+            hook.before_train_iter(runner, 0)
+
+        for group_idx, group in enumerate(runner.optim_wrapper.param_groups):
+            if group_idx == 2:  # lr is only checked for the third param group
+                self.assertEqual(round(group['lr'], 5), 0.0991)
+            self.assertEqual(group['momentum'], 0.80137)
+        self.assertFalse(hook._warmup_end)
+
+        # iteration 1000: values back at the configured lr and momentum
+        runner.iter = 1000
+        hook.before_train_iter(runner, 0)
+        self.assertFalse(hook._warmup_end)
+
+        for group in runner.optim_wrapper.param_groups:
+            self.assertEqual(group['lr'], 0.01)
+            self.assertEqual(group['momentum'], 0.937)
+
+        runner.iter = 1001  # one iteration later the warmup flag must be set
+        hook.before_train_iter(runner, 0)
+        self.assertTrue(hook._warmup_end)
+
+        # after_train_epoch at epoch 0 must keep lr and momentum unchanged
+        hook.after_train_epoch(runner)
+        for group in runner.optim_wrapper.param_groups:
+            self.assertEqual(group['lr'], 0.01)
+            self.assertEqual(group['momentum'], 0.937)
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe13413c4c2abf6369e3e439de63044dc68444c
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py
@@ -0,0 +1,67 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+from unittest.mock import Mock
+
+import torch
+from mmengine.config import Config
+from mmengine.runner import Runner
+from torch.utils.data import Dataset
+
+from mmyolo.engine.hooks import YOLOXModeSwitchHook
+from mmyolo.utils import register_all_modules
+
+
+class DummyDataset(Dataset):  # 12 random samples shared at class level
+    METAINFO = dict()  # type: ignore
+    data = torch.randn(12, 2)
+    label = torch.ones(12)
+
+    @property
+    def metainfo(self):
+        return self.METAINFO
+
+    def __len__(self):
+        return self.data.size(0)
+
+    def __getitem__(self, index):
+        return dict(inputs=self.data[index], data_sample=self.label[index])
+
+
+pipeline1 = [  # pipeline installed on the dataset before the hook runs
+    dict(type='mmdet.Resize'),
+]
+
+pipeline2 = [  # pipeline the hook is expected to install
+    dict(type='mmdet.RandomFlip'),
+]
+register_all_modules()
+
+
+class TestYOLOXModeSwitchHook(TestCase):
+
+    def test(self):
+        train_dataloader = dict(
+            dataset=DummyDataset(),
+            sampler=dict(type='DefaultSampler', shuffle=True),
+            batch_size=3,
+            num_workers=0)
+
+        runner = Mock()  # stand-in runner; only the attributes set below are real
+        runner.model = Mock()
+        runner.model.module = Mock()
+
+        runner.model.bbox_head.use_bbox_aux = False
+        runner.cfg.train_dataloader = Config(train_dataloader)
+        runner.train_dataloader = Runner.build_dataloader(train_dataloader)
+        runner.train_dataloader.dataset.pipeline = pipeline1
+
+        hook = YOLOXModeSwitchHook(
+            num_last_epochs=15, new_train_pipeline=pipeline2)
+
+        # presumably the switch epoch, since 284 + 1 == 300 - 15 -- TODO confirm
+        runner.epoch = 284
+        runner.max_epochs = 300
+        hook.before_train_epoch(runner)
+        self.assertTrue(runner.model.bbox_head.use_bbox_aux)
+        self.assertEqual(runner.train_loop.dataloader.dataset.pipeline,
+                         pipeline2)
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py
new file mode 100644
index 0000000000000000000000000000000000000000..4830e5cd604f99bb40f783c4815e124a37f11c96
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import copy
+from unittest import TestCase
+
+import torch
+import torch.nn as nn
+from mmengine.optim import build_optim_wrapper
+
+from mmyolo.engine import YOLOv5OptimizerConstructor
+from mmyolo.utils import register_all_modules
+
+register_all_modules()
+
+
+class ExampleModel(nn.Module):  # bare parameter, two convs and one BatchNorm
+
+    def __init__(self):
+        super().__init__()
+        self.param1 = nn.Parameter(torch.ones(1))
+        self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False)
+        self.conv2 = nn.Conv2d(4, 2, kernel_size=1)
+        self.bn = nn.BatchNorm2d(2)
+
+
+class TestYOLOv5OptimizerConstructor(TestCase):
+
+    def setUp(self):
+        self.model = ExampleModel()
+        self.base_lr = 0.01
+        self.weight_decay = 0.0001
+        self.optim_wrapper_cfg = dict(  # deep-copied by each test before use
+            type='OptimWrapper',
+            optimizer=dict(
+                type='SGD',
+                lr=self.base_lr,
+                momentum=0.9,
+                weight_decay=self.weight_decay,
+                batch_size_per_gpu=16))
+
+    def test_init(self):
+        YOLOv5OptimizerConstructor(copy.deepcopy(self.optim_wrapper_cfg))
+        YOLOv5OptimizerConstructor(
+            copy.deepcopy(self.optim_wrapper_cfg),
+            paramwise_cfg={'base_total_batch_size': 64})
+
+        # `paramwise_cfg` must include `base_total_batch_size` if not None.
+        with self.assertRaises(AssertionError):
+            YOLOv5OptimizerConstructor(
+                copy.deepcopy(self.optim_wrapper_cfg), paramwise_cfg={'a': 64})
+
+    def test_build(self):
+        optim_wrapper = YOLOv5OptimizerConstructor(
+            copy.deepcopy(self.optim_wrapper_cfg))(
+                self.model)
+        # three param groups; only group 0 keeps a non-zero weight_decay
+        assert len(optim_wrapper.optimizer.param_groups) == 3
+        for i in range(3):
+            param_groups_i = optim_wrapper.optimizer.param_groups[i]
+            assert param_groups_i['lr'] == self.base_lr
+            if i == 0:
+                assert param_groups_i['weight_decay'] == self.weight_decay
+            else:
+                assert param_groups_i['weight_decay'] == 0
+
+        # batch 128 doubles weight_decay (presumably vs. a base of 64 -- confirm)
+        optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg)
+        optim_wrapper_cfg['optimizer']['batch_size_per_gpu'] = 128
+        optim_wrapper = YOLOv5OptimizerConstructor(optim_wrapper_cfg)(
+            self.model)
+        assert optim_wrapper.optimizer.param_groups[0][
+            'weight_decay'] == self.weight_decay * 2
+
+        # without batch_size_per_gpu the weight_decay must stay unscaled
+        optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg)
+        optim_wrapper_cfg['optimizer'].pop('batch_size_per_gpu')
+        optim_wrapper = dict(
+            optim_wrapper_cfg, constructor='YOLOv5OptimizerConstructor')
+        optim_wrapper = build_optim_wrapper(self.model, optim_wrapper)
+        assert optim_wrapper.optimizer.param_groups[0][
+            'weight_decay'] == self.weight_decay
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f445bedd7b86ffaa00f4c74affa990eaeb663e
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py
@@ -0,0 +1,81 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import copy
+from unittest import TestCase
+
+import torch
+import torch.nn as nn
+from mmengine.optim import build_optim_wrapper
+
+from mmyolo.engine import YOLOv7OptimWrapperConstructor
+from mmyolo.utils import register_all_modules
+
+register_all_modules()
+
+
+class ExampleModel(nn.Module):  # bare parameter, two convs and one BatchNorm
+
+    def __init__(self):
+        super().__init__()
+        self.param1 = nn.Parameter(torch.ones(1))
+        self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False)
+        self.conv2 = nn.Conv2d(4, 2, kernel_size=1)
+        self.bn = nn.BatchNorm2d(2)
+
+
+class TestYOLOv7OptimWrapperConstructor(TestCase):
+
+    def setUp(self):
+        self.model = ExampleModel()
+        self.base_lr = 0.01
+        self.weight_decay = 0.0001
+        self.optim_wrapper_cfg = dict(  # deep-copied by each test before use
+            type='OptimWrapper',
+            optimizer=dict(
+                type='SGD',
+                lr=self.base_lr,
+                momentum=0.9,
+                weight_decay=self.weight_decay,
+                batch_size_per_gpu=16))
+
+    def test_init(self):
+        YOLOv7OptimWrapperConstructor(copy.deepcopy(self.optim_wrapper_cfg))
+        YOLOv7OptimWrapperConstructor(
+            copy.deepcopy(self.optim_wrapper_cfg),
+            paramwise_cfg={'base_total_batch_size': 64})
+
+        # `paramwise_cfg` must include `base_total_batch_size` if not None.
+        with self.assertRaises(AssertionError):
+            YOLOv7OptimWrapperConstructor(
+                copy.deepcopy(self.optim_wrapper_cfg), paramwise_cfg={'a': 64})
+
+    def test_build(self):
+        optim_wrapper = YOLOv7OptimWrapperConstructor(
+            copy.deepcopy(self.optim_wrapper_cfg))(
+                self.model)
+        # three param groups; only group 0 keeps a non-zero weight_decay
+        assert len(optim_wrapper.optimizer.param_groups) == 3
+        for i in range(3):
+            param_groups_i = optim_wrapper.optimizer.param_groups[i]
+            assert param_groups_i['lr'] == self.base_lr
+            if i == 0:
+                assert param_groups_i['weight_decay'] == self.weight_decay
+            else:
+                assert param_groups_i['weight_decay'] == 0
+
+        # batch 128 doubles weight_decay (presumably vs. a base of 64 -- confirm)
+        optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg)
+        optim_wrapper_cfg['optimizer']['batch_size_per_gpu'] = 128
+        optim_wrapper = YOLOv7OptimWrapperConstructor(optim_wrapper_cfg)(
+            self.model)
+        assert optim_wrapper.optimizer.param_groups[0][
+            'weight_decay'] == self.weight_decay * 2
+
+        # without batch_size_per_gpu the weight_decay must stay unscaled
+        optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg)
+        optim_wrapper_cfg['optimizer'].pop('batch_size_per_gpu')
+        optim_wrapper = dict(
+            optim_wrapper_cfg, constructor='YOLOv7OptimWrapperConstructor')
+        optim_wrapper = build_optim_wrapper(self.model, optim_wrapper)
+        assert optim_wrapper.optimizer.param_groups[0][
+            'weight_decay'] == self.weight_decay
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py
new file mode 100644
index 0000000000000000000000000000000000000000..82dceb55f90558b8d6bec48254640e248e7ba772
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py
@@ -0,0 +1,119 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+from parameterized import parameterized
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmyolo.models.backbones import (YOLOv5CSPDarknet, YOLOv8CSPDarknet,
+                                     YOLOXCSPDarknet)
+from mmyolo.utils import register_all_modules
+from .utils import check_norm_state, is_norm
+
+register_all_modules()
+
+
+class TestCSPDarknet(TestCase):  # every test runs once per backbone class
+
+    @parameterized.expand([(YOLOv5CSPDarknet, ), (YOLOXCSPDarknet, ),
+                           (YOLOv8CSPDarknet, )])
+    def test_init(self, module_class):
+        # out_indices must be in range(len(arch_setting) + 1)
+        with pytest.raises(AssertionError):
+            module_class(out_indices=(6, ))
+
+        with pytest.raises(ValueError):
+            # frozen_stages must be in range(-1, len(arch_setting) + 1)
+            module_class(frozen_stages=6)
+
+    @parameterized.expand([(YOLOv5CSPDarknet, ), (YOLOXCSPDarknet, ),
+                           (YOLOv8CSPDarknet, )])
+    def test_forward(self, module_class):
+        # Test CSPDarknet with first stage frozen
+        frozen_stages = 1
+        model = module_class(frozen_stages=frozen_stages)
+        model.init_weights()
+        model.train()
+
+        for mod in model.stem.modules():  # the stem must be frozen as well
+            for param in mod.parameters():
+                assert param.requires_grad is False
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    assert mod.training is False
+            for param in layer.parameters():
+                assert param.requires_grad is False
+
+        # Test CSPDarknet with norm_eval=True
+        model = module_class(norm_eval=True)
+        model.train()
+
+        assert check_norm_state(model.modules(), False)
+
+        # Test CSPDarknet-P5 forward with widen_factor=0.25
+        model = module_class(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 16, 32, 32))
+        assert feat[1].shape == torch.Size((1, 32, 16, 16))
+        assert feat[2].shape == torch.Size((1, 64, 8, 8))
+        assert feat[3].shape == torch.Size((1, 128, 4, 4))
+        assert feat[4].shape == torch.Size((1, 256, 2, 2))
+
+        # Test CSPDarknet forward with dict(type='ReLU')
+        model = module_class(
+            widen_factor=0.125,
+            act_cfg=dict(type='ReLU'),
+            out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test CSPDarknet with BatchNorm forward
+        model = module_class(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                assert isinstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test CSPDarknet with a DropBlock plugin on the last two stages
+        model = module_class(plugins=[
+            dict(
+                cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3),
+                stages=(False, False, True, True)),
+        ])
+
+        assert len(model.stage1) == 2
+        assert len(model.stage2) == 2
+        assert len(model.stage3) == 3  # +DropBlock
+        assert len(model.stage4) == 4  # +SPPF+DropBlock
+        model.train()
+        imgs = torch.randn(1, 3, 256, 256)
+        feat = model(imgs)
+        assert len(feat) == 3
+        assert feat[0].shape == torch.Size((1, 256, 32, 32))
+        assert feat[1].shape == torch.Size((1, 512, 16, 16))
+        assert feat[2].shape == torch.Size((1, 1024, 8, 8))
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd0f3c473a8adbf5fa139bff50a7d39006657065
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py
@@ -0,0 +1,113 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmyolo.models import PPYOLOECSPResNet
+from mmyolo.utils import register_all_modules
+from .utils import check_norm_state, is_norm
+
+register_all_modules()
+
+
+class TestPPYOLOECSPResNet(TestCase):
+
+    def test_init(self):
+        # out_indices must be in range(len(arch_setting) + 1)
+        with pytest.raises(AssertionError):
+            PPYOLOECSPResNet(out_indices=(6, ))
+
+        with pytest.raises(ValueError):
+            # frozen_stages must be in range(-1, len(arch_setting) + 1)
+            PPYOLOECSPResNet(frozen_stages=6)
+
+    def test_forward(self):
+        # Test PPYOLOECSPResNet with first stage frozen
+        frozen_stages = 1
+        model = PPYOLOECSPResNet(frozen_stages=frozen_stages)
+        model.init_weights()
+        model.train()
+
+        for mod in model.stem.modules():  # the stem must be frozen as well
+            for param in mod.parameters():
+                assert param.requires_grad is False
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    assert mod.training is False
+            for param in layer.parameters():
+                assert param.requires_grad is False
+
+        # Test PPYOLOECSPResNet with norm_eval=True
+        model = PPYOLOECSPResNet(norm_eval=True)
+        model.train()
+
+        assert check_norm_state(model.modules(), False)
+
+        # Test PPYOLOECSPResNet-P5 forward with widen_factor=0.25
+        model = PPYOLOECSPResNet(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 16, 32, 32))
+        assert feat[1].shape == torch.Size((1, 32, 16, 16))
+        assert feat[2].shape == torch.Size((1, 64, 8, 8))
+        assert feat[3].shape == torch.Size((1, 128, 4, 4))
+        assert feat[4].shape == torch.Size((1, 256, 2, 2))
+
+        # Test PPYOLOECSPResNet forward with dict(type='ReLU')
+        model = PPYOLOECSPResNet(
+            widen_factor=0.125,
+            act_cfg=dict(type='ReLU'),
+            out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test PPYOLOECSPResNet with BatchNorm forward
+        model = PPYOLOECSPResNet(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                assert isinstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test PPYOLOECSPResNet with a DropBlock plugin on the last two stages
+        model = PPYOLOECSPResNet(plugins=[
+            dict(
+                cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3),
+                stages=(False, False, True, True)),
+        ])
+
+        assert len(model.stage1) == 1
+        assert len(model.stage2) == 1
+        assert len(model.stage3) == 2  # +DropBlock
+        assert len(model.stage4) == 2  # +DropBlock
+        model.train()
+        imgs = torch.randn(1, 3, 256, 256)
+        feat = model(imgs)
+        assert len(feat) == 3
+        assert feat[0].shape == torch.Size((1, 256, 32, 32))
+        assert feat[1].shape == torch.Size((1, 512, 16, 16))
+        assert feat[2].shape == torch.Size((1, 1024, 8, 8))
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py
new file mode 100644
index 0000000000000000000000000000000000000000..53af20294137b0d29a67e4f1946fe9fd79991f80
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py
@@ -0,0 +1,202 @@
+# Copyright (c) OpenMMLab.
All rights reserved.
+from unittest import TestCase
+
+import pytest
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from mmyolo.models.backbones import YOLOv6CSPBep, YOLOv6EfficientRep
+from mmyolo.utils import register_all_modules
+from .utils import check_norm_state, is_norm
+
+register_all_modules()
+
+
+class TestYOLOv6EfficientRep(TestCase):  # covers EfficientRep and CSPBep
+
+    def test_init(self):
+        # out_indices must be in range(len(arch_setting) + 1)
+        with pytest.raises(AssertionError):
+            YOLOv6EfficientRep(out_indices=(6, ))
+
+        with pytest.raises(ValueError):
+            # frozen_stages must be in range(-1, len(arch_setting) + 1)
+            YOLOv6EfficientRep(frozen_stages=6)
+
+    def test_YOLOv6EfficientRep_forward(self):
+        # Test YOLOv6EfficientRep with first stage frozen
+        frozen_stages = 1
+        model = YOLOv6EfficientRep(frozen_stages=frozen_stages)
+        model.init_weights()
+        model.train()
+
+        for mod in model.stem.modules():  # the stem must be frozen as well
+            for param in mod.parameters():
+                assert param.requires_grad is False
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    assert mod.training is False
+            for param in layer.parameters():
+                assert param.requires_grad is False
+
+        # Test YOLOv6EfficientRep with norm_eval=True
+        model = YOLOv6EfficientRep(norm_eval=True)
+        model.train()
+
+        assert check_norm_state(model.modules(), False)
+
+        # Test YOLOv6EfficientRep-P5 forward with widen_factor=0.25
+        model = YOLOv6EfficientRep(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 16, 32, 32))
+        assert feat[1].shape == torch.Size((1, 32, 16, 16))
+        assert feat[2].shape == torch.Size((1, 64, 8, 8))
+        assert feat[3].shape == torch.Size((1, 128, 4, 4))
+        assert feat[4].shape == torch.Size((1, 256, 2, 2))
+
+        # Test YOLOv6EfficientRep forward with dict(type='ReLU')
+        model = YOLOv6EfficientRep(
+            widen_factor=0.125,
+            act_cfg=dict(type='ReLU'),
+            out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test YOLOv6EfficientRep with BatchNorm forward
+        model = YOLOv6EfficientRep(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                assert isinstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test YOLOv6EfficientRep with a DropBlock plugin on the last two stages
+        model = YOLOv6EfficientRep(plugins=[
+            dict(
+                cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3),
+                stages=(False, False, True, True)),
+        ])
+
+        assert len(model.stage1) == 1
+        assert len(model.stage2) == 1
+        assert len(model.stage3) == 2  # +DropBlock
+        assert len(model.stage4) == 3  # +SPPF+DropBlock
+        model.train()
+        imgs = torch.randn(1, 3, 256, 256)
+        feat = model(imgs)
+        assert len(feat) == 3
+        assert feat[0].shape == torch.Size((1, 256, 32, 32))
+        assert feat[1].shape == torch.Size((1, 512, 16, 16))
+        assert feat[2].shape == torch.Size((1, 1024, 8, 8))
+
+    def test_YOLOv6CSPBep_forward(self):
+        # Test YOLOv6CSPBep with first stage frozen
+        frozen_stages = 1
+        model = YOLOv6CSPBep(frozen_stages=frozen_stages)
+        model.init_weights()
+        model.train()
+
+        for mod in model.stem.modules():  # the stem must be frozen as well
+            for param in mod.parameters():
+                assert param.requires_grad is False
+        for i in range(1, frozen_stages + 1):
+            layer = getattr(model, f'stage{i}')
+            for mod in layer.modules():
+                if isinstance(mod, _BatchNorm):
+                    assert mod.training is False
+            for param in layer.parameters():
+                assert param.requires_grad is False
+
+        # Test YOLOv6CSPBep with norm_eval=True
+        model = YOLOv6CSPBep(norm_eval=True)
+        model.train()
+
+        assert check_norm_state(model.modules(), False)
+
+        # Test YOLOv6CSPBep-P5 forward with widen_factor=0.25
+        model = YOLOv6CSPBep(
+            arch='P5', widen_factor=0.25, out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 16, 32, 32))
+        assert feat[1].shape == torch.Size((1, 32, 16, 16))
+        assert feat[2].shape == torch.Size((1, 64, 8, 8))
+        assert feat[3].shape == torch.Size((1, 128, 4, 4))
+        assert feat[4].shape == torch.Size((1, 256, 2, 2))
+
+        # Test YOLOv6CSPBep forward with dict(type='ReLU')
+        model = YOLOv6CSPBep(
+            widen_factor=0.125,
+            act_cfg=dict(type='ReLU'),
+            out_indices=range(0, 5))
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test YOLOv6CSPBep with BatchNorm forward
+        model = YOLOv6CSPBep(widen_factor=0.125, out_indices=range(0, 5))
+        for m in model.modules():
+            if is_norm(m):
+                assert isinstance(m, _BatchNorm)
+        model.train()
+
+        imgs = torch.randn(1, 3, 64, 64)
+        feat = model(imgs)
+        assert len(feat) == 5
+        assert feat[0].shape == torch.Size((1, 8, 32, 32))
+        assert feat[1].shape == torch.Size((1, 16, 16, 16))
+        assert feat[2].shape == torch.Size((1, 32, 8, 8))
+        assert feat[3].shape == torch.Size((1, 64, 4, 4))
+        assert feat[4].shape == torch.Size((1, 128, 2, 2))
+
+        # Test YOLOv6CSPBep with a DropBlock plugin on the last two stages
+        model = YOLOv6CSPBep(plugins=[
+            dict(
+                cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3),
+                stages=(False, False, True, True)),
+        ])
+
+        assert len(model.stage1) == 1
+        assert len(model.stage2) == 1
+        assert len(model.stage3) == 2  # +DropBlock
+        assert len(model.stage4) == 3  # +SPPF+DropBlock
+        model.train()
+        imgs = torch.randn(1, 3, 256, 256)
+        feat = model(imgs)
+        assert len(feat) == 3
+        assert feat[0].shape == torch.Size((1, 256, 32, 32))
+        assert feat[1].shape == torch.Size((1, 512, 16, 16))
+        assert feat[2].shape == torch.Size((1, 1024, 8, 8))
diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..76b40aa44b99ea1509be6768a6c4287652961ad0
--- /dev/null
+++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase + +import pytest +import torch +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.models.backbones import YOLOv7Backbone +from mmyolo.utils import register_all_modules +from .utils import check_norm_state + +register_all_modules() + + +class TestYOLOv7Backbone(TestCase): + + def test_init(self): + # out_indices in range(len(arch_setting) + 1) + with pytest.raises(AssertionError): + YOLOv7Backbone(out_indices=(6, )) + + with pytest.raises(ValueError): + # frozen_stages must in range(-1, len(arch_setting) + 1) + YOLOv7Backbone(frozen_stages=6) + + def test_forward(self): + # Test YOLOv7Backbone-L with first stage frozen + frozen_stages = 1 + model = YOLOv7Backbone(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test YOLOv7Backbone-L with norm_eval=True + model = YOLOv7Backbone(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test YOLOv7Backbone-L forward with widen_factor=0.25 + model = YOLOv7Backbone( + widen_factor=0.25, out_indices=tuple(range(0, 5))) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 64, 16, 16)) + assert feat[2].shape == torch.Size((1, 128, 8, 8)) + assert feat[3].shape == torch.Size((1, 256, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv7Backbone-L with plugins + model = YOLOv7Backbone( + widen_factor=0.25, + plugins=[ + dict( + cfg=dict( + type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, 
True, True)), + ]) + + assert len(model.stage1) == 2 + assert len(model.stage2) == 2 + assert len(model.stage3) == 3 # +DropBlock + assert len(model.stage4) == 3 # +DropBlock + model.train() + imgs = torch.randn(1, 3, 128, 128) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 128, 16, 16)) + assert feat[1].shape == torch.Size((1, 256, 8, 8)) + assert feat[2].shape == torch.Size((1, 256, 4, 4)) + + # Test YOLOv7Backbone-X forward with widen_factor=0.25 + model = YOLOv7Backbone(arch='X', widen_factor=0.25) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 160, 8, 8)) + assert feat[1].shape == torch.Size((1, 320, 4, 4)) + assert feat[2].shape == torch.Size((1, 320, 2, 2)) + + # Test YOLOv7Backbone-tiny forward with widen_factor=0.25 + model = YOLOv7Backbone(arch='Tiny', widen_factor=0.25) + model.train() + + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 32, 8, 8)) + assert feat[1].shape == torch.Size((1, 64, 4, 4)) + assert feat[2].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='W', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + imgs = torch.randn(1, 3, 128, 128) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 64, 16, 16)) + assert feat[1].shape == torch.Size((1, 128, 8, 8)) + assert feat[2].shape == torch.Size((1, 192, 4, 4)) + assert feat[3].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='D', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 96, 16, 16)) + assert feat[1].shape == torch.Size((1, 192, 8, 8)) + assert feat[2].shape == torch.Size((1, 288, 4, 4)) + assert feat[3].shape == 
torch.Size((1, 384, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='E', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 80, 16, 16)) + assert feat[1].shape == torch.Size((1, 160, 8, 8)) + assert feat[2].shape == torch.Size((1, 240, 4, 4)) + assert feat[3].shape == torch.Size((1, 320, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='E2E', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 80, 16, 16)) + assert feat[1].shape == torch.Size((1, 160, 8, 8)) + assert feat[2].shape == torch.Size((1, 240, 4, 4)) + assert feat[3].shape == torch.Size((1, 320, 2, 2)) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/utils.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d65db568d6f1693eb457dc74b0d8c417cef1b9ea --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_backbone/utils.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.backbones.res2net import Bottle2neck +from mmdet.models.backbones.resnet import BasicBlock, Bottleneck +from mmdet.models.backbones.resnext import Bottleneck as BottleneckX +from mmdet.models.layers import SimplifiedBasicBlock +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + + +def is_block(modules): + """Check if is ResNet building block.""" + if isinstance(modules, (BasicBlock, Bottleneck, BottleneckX, Bottle2neck, + SimplifiedBasicBlock)): + return True + return False + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..2c7e4415b627afe0046bc30b3b416af9deb302b6 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmdet.structures import DetDataSample +from mmengine import MessageHub + +from mmyolo.models import PPYOLOEBatchRandomResize, PPYOLOEDetDataPreprocessor +from mmyolo.models.data_preprocessors import (YOLOv5DetDataPreprocessor, + YOLOXBatchSyncRandomResize) +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5DetDataPreprocessor(TestCase): + + def test_forward(self): + processor = YOLOv5DetDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1]) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': [DetDataSample()] + } + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + + self.assertEqual(batch_inputs.shape, (1, 3, 11, 10)) + self.assertEqual(len(batch_data_samples), 1) + + # test channel_conversion + processor = YOLOv5DetDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True) + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + self.assertEqual(batch_inputs.shape, (1, 3, 11, 10)) + self.assertEqual(len(batch_data_samples), 1) + + # test padding, training=False + data = { + 'inputs': [ + torch.randint(0, 256, (3, 10, 11)), + torch.randint(0, 256, (3, 9, 14)) + ] + } + processor = YOLOv5DetDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True) + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + self.assertEqual(batch_inputs.shape, (2, 3, 10, 14)) + self.assertIsNone(batch_data_samples) + + # test training + data = { + 'inputs': torch.randint(0, 256, (2, 3, 10, 11)), + 'data_samples': { + 'bboxes_labels': torch.randint(0, 11, (18, 6)) + }, + } + out_data = processor(data, training=True) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + 
self.assertIn('img_metas', batch_data_samples) + self.assertIn('bboxes_labels', batch_data_samples) + self.assertEqual(batch_inputs.shape, (2, 3, 10, 11)) + self.assertIsInstance(batch_data_samples['bboxes_labels'], + torch.Tensor) + self.assertIsInstance(batch_data_samples['img_metas'], list) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': [DetDataSample()] + } + # data_samples must be dict + with self.assertRaises(AssertionError): + processor(data, training=True) + + +class TestPPYOLOEDetDataPreprocessor(TestCase): + + def test_batch_random_resize(self): + processor = PPYOLOEDetDataPreprocessor( + pad_size_divisor=32, + batch_augments=[ + dict( + type='PPYOLOEBatchRandomResize', + random_size_range=(320, 480), + interval=1, + size_divisor=32, + random_interp=True, + keep_ratio=False) + ], + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True) + self.assertTrue( + isinstance(processor.batch_augments[0], PPYOLOEBatchRandomResize)) + message_hub = MessageHub.get_instance('test_batch_random_resize') + message_hub.update_info('iter', 0) + + # test training + data = { + 'inputs': [ + torch.randint(0, 256, (3, 10, 11)), + torch.randint(0, 256, (3, 10, 11)) + ], + 'data_samples': { + 'bboxes_labels': torch.randint(0, 11, (18, 6)).float() + }, + } + out_data = processor(data, training=True) + batch_data_samples = out_data['data_samples'] + self.assertIn('img_metas', batch_data_samples) + self.assertIn('bboxes_labels', batch_data_samples) + self.assertIsInstance(batch_data_samples['bboxes_labels'], + torch.Tensor) + self.assertIsInstance(batch_data_samples['img_metas'], list) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': DetDataSample() + } + # data_samples must be list + with self.assertRaises(AssertionError): + processor(data, training=True) + + +class TestYOLOXDetDataPreprocessor(TestCase): + + def test_batch_sync_random_size(self): + processor = YOLOXBatchSyncRandomResize( + 
random_size_range=(480, 800), size_divisor=32, interval=1) + self.assertTrue(isinstance(processor, YOLOXBatchSyncRandomResize)) + message_hub = MessageHub.get_instance( + 'test_yolox_batch_sync_random_resize') + message_hub.update_info('iter', 0) + + # test training + inputs = torch.randint(0, 256, (4, 3, 10, 11)) + data_samples = {'bboxes_labels': torch.randint(0, 11, (18, 6)).float()} + + inputs, data_samples = processor(inputs, data_samples) + + self.assertIn('bboxes_labels', data_samples) + self.assertIsInstance(data_samples['bboxes_labels'], torch.Tensor) + self.assertIsInstance(inputs, torch.Tensor) + + inputs = torch.randint(0, 256, (4, 3, 10, 11)) + data_samples = DetDataSample() + + # data_samples must be dict + with self.assertRaises(AssertionError): + processor(inputs, data_samples) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..20e0c45761454f3575856babe39fa3fc95e6d5fa --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine import ConfigDict, MessageHub +from mmengine.config import Config +from mmengine.model import bias_init_with_prob +from mmengine.testing import assert_allclose + +from mmyolo.models import PPYOLOEHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestPPYOLOEHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='PPYOLOEHeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=(8, 16, 32)) + + def test_init_weights(self): + head = PPYOLOEHead(head_module=self.head_module) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_reg in zip(head.head_module.cls_preds, + head.head_module.reg_preds): + assert_allclose(conv_cls.weight.data, + torch.zeros_like(conv_cls.weight.data)) + assert_allclose(conv_reg.weight.data, + torch.zeros_like(conv_reg.weight.data)) + + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_reg.bias.data, + torch.ones_like(conv_reg.bias.data)) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + nms_pre=1000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=300)) + + head = PPYOLOEHead(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + feat = [ + torch.rand(1, in_channels, s // feat_size, s // feat_size) + for in_channels, feat_size in [[32, 8], [64, 16], [128, 32]] + ] + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + message_hub = 
MessageHub.get_instance('test_ppyoloe_loss_by_feat') + message_hub.update_info('epoch', 1) + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = PPYOLOEHead( + head_module=self.head_module, + train_cfg=ConfigDict( + initial_epoch=31, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=4, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=4, + topk=13, + alpha=1, + beta=6))) + head.train() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6), dtype=torch.float32) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_dfl_loss = empty_gt_losses['loss_dfl'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertEqual( + empty_dfl_loss.item(), 0, + 'there should be df loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = PPYOLOEHead( + head_module=self.head_module, + train_cfg=ConfigDict( + initial_epoch=31, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=4, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=4, + topk=13, + alpha=1, + beta=6))) + head.train() + gt_instances = torch.Tensor( + [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_loss_dfl = one_gt_losses['loss_dfl'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_loss_dfl.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = PPYOLOEHead( + head_module=self.head_module, + train_cfg=ConfigDict( + initial_epoch=31, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=1, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=1, + topk=13, + alpha=1, + beta=6))) + head.train() + gt_instances = torch.Tensor( + [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]]) + cls_scores, 
bbox_preds, bbox_dist_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_loss_dfl = one_gt_losses['loss_dfl'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_loss_dfl.item(), 0, + 'obj loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..21e1d4d139a2cbf2815f69ffac105100bcd62f34 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py @@ -0,0 +1,264 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import pytest +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import RTMDetRotatedHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestRTMDetRotatedHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetRotatedSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16]) + + def test_init_weights(self): + head = RTMDetRotatedHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = dict( + multi_label=True, + decode_with_angle=True, + nms_pre=2000, + score_thr=0.01, + nms=dict(type='nms_rotated', iou_threshold=0.1), + max_per_img=300) + test_cfg = Config(test_cfg) + + head = RTMDetRotatedHead( + head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + angle_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + angle_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=80, + topk=13, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = 
Config(train_cfg) + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size).cuda() + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 5)).cuda(), + labels=torch.LongTensor([]).cuda()) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + angle_preds, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, + 0.2]]).cuda(), + labels=torch.LongTensor([1]).cuda()) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, + 0.2]]).cuda(), + labels=torch.LongTensor([0]).cuda()) + + cls_scores, 
bbox_preds, angle_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + def test_hbb_loss_by_feat(self): + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=80, + topk=13, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = Config(train_cfg) + hbb_cfg = dict( + bbox_coder=dict( + type='DistanceAnglePointCoder', angle_version='le90'), + loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=2.0), + angle_coder=dict( + type='mmrotate.CSLCoder', + angle_version='le90', + omega=1, + window='gaussian', + radius=1), + loss_angle=dict( + type='mmrotate.SmoothFocalLoss', + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + use_hbbox_loss=True, + ) + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 5)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + angle_preds, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_angle_loss = empty_gt_losses['loss_angle'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertEqual( + empty_angle_loss.item(), 0, + 'there should be no angle loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, 0.2]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_angle_loss = one_gt_losses['loss_angle'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_angle_loss.item(), 0, + 'angle loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, 0.2]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_angle_loss = one_gt_losses['loss_angle'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + 
self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_angle_loss.item(), 0, + 'angle loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..cce5ee6ffae5c697b32430b9b13cab16127450bb --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models import RTMDetInsSepBNHead +from mmyolo.models.dense_heads import RTMDetHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestRTMDetHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16]) + + def test_init_weights(self): + head = RTMDetHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300) + test_cfg = Config(test_cfg) + + head = RTMDetHead(head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + 
cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + num_classes=80, + type='BatchDynamicSoftLabelAssigner', + topk=13, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = Config(train_cfg) + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + +class TestRTMDetInsHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetInsSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16], + num_prototypes=8, + dyconv_channels=8, + num_dyconvs=3, + share_conv=True, + use_sigmoid_cls=True) + + def test_init_weights(self): + head = 
RTMDetInsSepBNHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + 'pad_param': np.array([0., 0., 0., 0.]) + }] + test_cfg = dict( + multi_label=False, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5) + test_cfg = Config(test_cfg) + + head = RTMDetInsSepBNHead( + head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, kernel_preds, mask_feat = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + img_metas_without_pad_param = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0) + }] + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas_without_pad_param, + cfg=test_cfg, + rescale=True, + with_nms=True) + + with self.assertRaises(AssertionError): + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..974b9a9869dbcf39e6928cadd7399b452ba93e1d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py @@ -0,0 +1,411 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import YOLOv5Head, YOLOv5InsHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv5HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + num_base_priors=3) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv5Head(head_module=self.head_module, test_cfg=test_cfg) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5Head(head_module=self.head_module) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict 
+ # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, [gt_instances], + img_metas) + # When there is no truth, the cls and box losses should be zero; only + # the objectness loss should be nonzero. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+ onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + def test_loss_by_feat_with_ignore(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + # ignore boxes + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + empty_gt_losses = head._loss_by_feat_with_ignore( + cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas, + gt_instances_ignore) + # When there is no truth, the cls and box losses should be zero; only + # the objectness loss should be nonzero.
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + one_gt_losses = head._loss_by_feat_with_ignore(cls_scores, bbox_preds, + objectnesses, + [gt_instances], + img_metas, + gt_instances_ignore) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + one_gt_losses = head._loss_by_feat_with_ignore(cls_scores, bbox_preds, + objectnesses, + [gt_instances], + img_metas, + gt_instances_ignore) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = 
one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + +class TestYOLOv5InsHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv5InsHeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + mask_channels=32, + proto_channels=32, + widen_factor=1.0) + + def test_init_weights(self): + head = YOLOv5InsHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5)) + + head = YOLOv5InsHead(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + with torch.no_grad(): + res = head.forward(feat) + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = res + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + with self.assertRaises(AssertionError): + head.predict_by_feat( + cls_scores, + bbox_preds, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), 
+ 'scale_factor': 1, + }] + + head = YOLOv5InsHead(head_module=self.head_module) + rng = np.random.RandomState(0) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_bboxes_labels = torch.empty((0, 6)) + gt_masks = rng.rand(0, s // 4, s // 4) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, coeff_preds, + proto_preds, gt_bboxes_labels, + gt_masks, img_metas) + # When there is no truth, the cls, box and mask losses should be zero; + # only the objectness loss should be nonzero. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_mask_loss = empty_gt_losses['loss_mask'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual( + empty_mask_loss.item(), 0, + 'there should be no mask loss when there are no true masks') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5InsHead(head_module=self.head_module) + + bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]) + labels = torch.Tensor([1.]) + batch_id = torch.LongTensor([0]) + gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes], + dim=1) + gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int() + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
objectnesses, + coeff_preds, proto_preds, + gt_bboxes_labels, gt_masks, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_mask_loss = one_gt_losses['loss_mask'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_mask_loss.item(), 0, + 'mask loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5InsHead(head_module=self.head_module) + bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]) + labels = torch.Tensor([1.]) + batch_id = torch.LongTensor([0]) + gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes], + dim=1) + gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int() + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + coeff_preds, proto_preds, + gt_bboxes_labels, gt_masks, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_mask_loss = one_gt_losses['loss_mask'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_mask_loss.item(), 0, + 'mask loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb951d12360614b26b5d3ccf30d1c044ab0ccdc --- 
/dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from mmengine.config import Config + +from mmyolo.models.dense_heads import YOLOv6Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv6Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv6HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32]) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv6Head(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5033f97e19673af79ab9a9c3ee2c618db3ea80e0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import YOLOv7Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +# TODO: Test YOLOv7p6HeadModule +class TestYOLOv7Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv7HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + num_base_priors=3) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv7Head(head_module=self.head_module, test_cfg=test_cfg) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv7Head(head_module=self.head_module) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + 
# background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, [gt_instances], + img_metas) + # When there is no truth, the cls and box losses should be zero; only + # the objectness loss should be nonzero. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv7Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv7Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss =
one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8980387a75bdd4ac1d3aebacf8a364e82259a01b --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from mmengine import ConfigDict +from mmengine.config import Config + +from mmyolo.models import YOLOv8Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv8Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv8HeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32]) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv8Head(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + 
None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv8Head( + head_module=self.head_module, + train_cfg=ConfigDict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=4, + topk=10, + alpha=0.5, + beta=6))) + head.train() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6), dtype=torch.float32) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_dfl_loss = empty_gt_losses['loss_dfl'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertEqual( + empty_dfl_loss.item(), 0, + 'there should be df loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + gt_instances = torch.Tensor( + [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_loss_dfl = one_gt_losses['loss_dfl'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_loss_dfl.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv8Head( + head_module=self.head_module, + train_cfg=ConfigDict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=1, + topk=10, + alpha=0.5, + beta=6))) + head.train() + + gt_instances = torch.Tensor( + [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874], + [1., 0., 24.6667, 27.8757, 28.6326, 51.8874]]) + cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_loss_dfl = one_gt_losses['loss_dfl'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss 
should be non-zero') + self.assertGreater(onegt_loss_dfl.item(), 0, + 'obj loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..390994417c7fc9c0b2cb4470484ee3e28248a4a5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py @@ -0,0 +1,379 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from mmengine.config import Config +from mmengine.model import bias_init_with_prob +from mmengine.testing import assert_allclose + +from mmyolo.models.dense_heads import YOLOXHead, YOLOXPoseHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOXHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOXHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + ) + + def test_init_weights(self): + head = YOLOXHead(head_module=self.head_module) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(head.head_module.multi_level_conv_cls, + head.head_module.multi_level_conv_obj): + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_obj.bias.data, + torch.ones_like(conv_obj.bias.data) * bias_init) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOXHead(head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, 
objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'scale_factor': 1, + }] + train_cfg = Config( + dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + center_radius=2.5, + candidate_topk=10, + iou_weight=3.0, + cls_weight=1.0))) + + head = YOLOXHead(head_module=self.head_module, train_cfg=train_cfg) + assert not head.use_bbox_aux + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6)) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, gt_instances, + img_metas) + # When there is no truth, the cls and box losses should be zero; only + # the objectness loss should be nonzero.
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXHead(head_module=self.head_module, train_cfg=train_cfg) + head.use_bbox_aux = True + gt_instances = torch.Tensor( + [[0, 2, 23.6667, 23.8757, 238.6326, 151.8874]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + gt_instances, img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_l1_loss = one_gt_losses['loss_bbox_aux'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_l1_loss.item(), 0, + 'l1 loss should be non-zero') + + # Test ground truth out of bounds + gt_instances = torch.Tensor( + [[0, 2, s * 4, s * 4, s * 4 + 10, s * 4 + 10]]) + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, gt_instances, + img_metas) + # When gt_bboxes are out of bounds, the assignment results should be + # empty, so the cls and bbox losses should be zero.
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when gt_bboxes out of bound') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when gt_bboxes out of bound') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + +class TestYOLOXPoseHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=17, + in_channels=1, + stacked_convs=1, + ) + self.train_cfg = Config( + dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py')))) + self.loss_pose = Config( + dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)) + + def test_init_weights(self): + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj, conv_vis in zip( + head.head_module.multi_level_conv_cls, + head.head_module.multi_level_conv_obj, + head.head_module.multi_level_conv_vis): + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_obj.bias.data, + torch.ones_like(conv_obj.bias.data) * bias_init) + assert_allclose(conv_vis.bias.data, + torch.ones_like(conv_vis.bias.data) * bias_init) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOXPoseHead( + 
head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg, + test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + offsets_preds, + vis_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'scale_factor': 1, + }] + + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + assert not head.use_bbox_aux + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6)) + gt_keypoints = torch.empty((0, 17, 2)) + gt_keypoints_visible = torch.empty((0, 17)) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, offsets_preds, + vis_preds, gt_instances, + gt_keypoints, gt_keypoints_visible, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_loss_kpt = empty_gt_losses['loss_kpt'].sum() + empty_loss_vis = empty_gt_losses['loss_vis'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual( + empty_loss_kpt.item(), 0, + 'there should be no kpt loss when there are no true keypoints') + self.assertEqual( + empty_loss_vis.item(), 0, + 'there should be no vis loss when there are no true keypoints') + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + gt_instances = torch.Tensor( + [[0, 0, 23.6667, 23.8757, 238.6326, 151.8874]]) + gt_keypoints = torch.Tensor([[[317.1519, + 429.8433], [338.3080, 416.9187], + [298.9951, + 403.8911], [102.7025, 273.1329], + [255.4321, + 404.8712], [400.0422, 554.4373], + [167.7857, + 516.7591], [397.4943, 737.4575], + [116.3247, + 674.5684], [102.7025, 273.1329], + [66.0319, + 808.6383], [102.7025, 273.1329], + [157.6150, + 819.1249], [102.7025, 273.1329], + [102.7025, + 273.1329], [102.7025, 273.1329], + [102.7025, 273.1329]]]) + gt_keypoints_visible = torch.Tensor([[ + 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
+ ]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + offsets_preds, vis_preds, + gt_instances, gt_keypoints, + gt_keypoints_visible, img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_loss_kpt = one_gt_losses['loss_kpt'].sum() + onegt_loss_vis = one_gt_losses['loss_vis'].sum() + + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_loss_kpt.item(), 0, + 'kpt loss should be non-zero') + self.assertGreater(onegt_loss_vis.item(), 0, + 'vis loss should be non-zero') + + # Test groud truth out of bound + gt_instances = torch.Tensor( + [[0, 2, s * 4, s * 4, s * 4 + 10, s * 4 + 10]]) + gt_keypoints = torch.Tensor([[[s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10], [s * 4, s * 4 + 10], + [s * 4, s * 4 + 10]]]) + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, offsets_preds, + vis_preds, gt_instances, + gt_keypoints, gt_keypoints_visible, + img_metas) + # When gt_bboxes out of bound, the assign results should be empty, + # so the cls and bbox loss should be zero. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_kpt_loss = empty_gt_losses['loss_kpt'].sum() + empty_vis_loss = empty_gt_losses['loss_vis'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when gt_bboxes out of bound') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when gt_bboxes out of bound') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual(empty_kpt_loss.item(), 0, + 'kps loss should be non-zero') + self.assertEqual(empty_vis_loss.item(), 0, + 'vis loss should be non-zero') diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..4b2952040d193781a6d042976c336485232e1a0a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import time +import unittest +from unittest import TestCase + +import torch +from mmdet.structures import DetDataSample +from mmdet.testing import demo_mm_inputs +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmyolo.testing import get_detector_cfg +from mmyolo.utils import register_all_modules + + +class TestSingleStageDetector(TestCase): + + def setUp(self): + register_all_modules() + + @parameterized.expand([ + 'yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', + 'yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', + 'yolox/yolox_tiny_fast_8xb8-300e_coco.py', + 'rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', + 'yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', + 'yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py' + ]) + def test_init(self, cfg_file): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + detector = MODELS.build(model) + self.assertTrue(detector.backbone) + self.assertTrue(detector.neck) + self.assertTrue(detector.bbox_head) + + @parameterized.expand([ + ('yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + ('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_single_stage_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + if 'fast' in cfg_file: + model.data_preprocessor = dict( + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True) + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = 
MODELS.build(model) + detector.init_weights() + + if device == 'cuda': + if not torch.cuda.is_available(): + self.skipTest('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, True) + losses = detector.forward(**data, mode='loss') + self.assertIsInstance(losses, dict) + + @parameterized.expand([ + ('yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', ('cuda', + 'cpu')), + ('yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', ('cuda', 'cpu')), + ('yolox/yolox_tiny_fast_8xb8-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + ('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_predict_mode(self, cfg_file, devices): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = MODELS.build(model) + + if device == 'cuda': + if not torch.cuda.is_available(): + self.skipTest('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, False) + # Test forward test + detector.eval() + with torch.no_grad(): + batch_results = detector.forward(**data, mode='predict') + self.assertEqual(len(batch_results), 2) + self.assertIsInstance(batch_results[0], DetDataSample) + + @parameterized.expand([ + ('yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', ('cuda', + 'cpu')), + ('yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', ('cuda', 'cpu')), + ('yolox/yolox_tiny_fast_8xb8-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + 
('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_tensor_mode(self, cfg_file, devices): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = MODELS.build(model) + + if device == 'cuda': + if not torch.cuda.is_available(): + self.skipTest('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, False) + batch_results = detector.forward(**data, mode='tensor') + self.assertIsInstance(batch_results, tuple) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_ema.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_ema.py new file mode 100644 index 0000000000000000000000000000000000000000..b35838280ee5bc09d7c82b451f72468b53f5583f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_ema.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import itertools +import math +from unittest import TestCase + +import torch +import torch.nn as nn +from mmengine.testing import assert_allclose + +from mmyolo.models.layers import ExpMomentumEMA + + +class TestEMA(TestCase): + + def test_exp_momentum_ema(self): + model = nn.Sequential(nn.Conv2d(1, 5, kernel_size=3), nn.Linear(5, 10)) + # Test invalid gamma + with self.assertRaisesRegex(AssertionError, + 'gamma must be greater than 0'): + ExpMomentumEMA(model, gamma=-1) + + # Test EMA + model = torch.nn.Sequential( + torch.nn.Conv2d(1, 5, kernel_size=3), torch.nn.Linear(5, 10)) + momentum = 0.1 + gamma = 4 + + ema_model = ExpMomentumEMA(model, momentum=momentum, gamma=gamma) + averaged_params = [ + torch.zeros_like(param) for param in model.parameters() + ] + n_updates = 10 + for i in range(n_updates): + updated_averaged_params = [] + for p, p_avg in zip(model.parameters(), averaged_params): + p.detach().add_(torch.randn_like(p)) + if i == 0: + updated_averaged_params.append(p.clone()) + else: + m = (1 - momentum) * math.exp(-(1 + i) / gamma) + momentum + updated_averaged_params.append( + (p_avg * (1 - m) + p * m).clone()) + ema_model.update_parameters(model) + averaged_params = updated_averaged_params + + for p_target, p_ema in zip(averaged_params, ema_model.parameters()): + assert_allclose(p_target, p_ema) + + def test_exp_momentum_ema_update_buffer(self): + model = nn.Sequential( + nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3), + nn.Linear(5, 10)) + # Test invalid gamma + with self.assertRaisesRegex(AssertionError, + 'gamma must be greater than 0'): + ExpMomentumEMA(model, gamma=-1) + + # Test EMA with momentum annealing. 
+ momentum = 0.1 + gamma = 4 + + ema_model = ExpMomentumEMA( + model, gamma=gamma, momentum=momentum, update_buffers=True) + averaged_params = [ + torch.zeros_like(param) + for param in itertools.chain(model.parameters(), model.buffers()) + if param.size() != torch.Size([]) + ] + n_updates = 10 + for i in range(n_updates): + updated_averaged_params = [] + params = [ + param for param in itertools.chain(model.parameters(), + model.buffers()) + if param.size() != torch.Size([]) + ] + for p, p_avg in zip(params, averaged_params): + p.detach().add_(torch.randn_like(p)) + if i == 0: + updated_averaged_params.append(p.clone()) + else: + m = (1 - momentum) * math.exp(-(1 + i) / gamma) + momentum + updated_averaged_params.append( + (p_avg * (1 - m) + p * m).clone()) + ema_model.update_parameters(model) + averaged_params = updated_averaged_params + + ema_params = [ + param for param in itertools.chain(ema_model.module.parameters(), + ema_model.module.buffers()) + if param.size() != torch.Size([]) + ] + for p_target, p_ema in zip(averaged_params, ema_params): + assert_allclose(p_target, p_ema) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..5331a4e013c797052ed003b64b477d24ad10444c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from unittest import TestCase + +import torch + +from mmyolo.models.layers import SPPFBottleneck +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestSPPFBottleneck(TestCase): + + def test_forward(self): + input_tensor = torch.randn((1, 3, 20, 20)) + bottleneck = SPPFBottleneck(3, 16) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + bottleneck = SPPFBottleneck(3, 16, kernel_sizes=[3, 5, 7]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + # set len(kernel_sizes)=4 + bottleneck = SPPFBottleneck(3, 16, kernel_sizes=[3, 5, 7, 9]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + # set use_conv_first=False + bottleneck = SPPFBottleneck( + 3, 16, use_conv_first=False, kernel_sizes=[3, 5, 7, 9]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b26c99aa3c90c9e53be6ef7f8f28c4996c49ca2f --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.necks import CSPNeXtPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestCSPNeXtPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = 24 + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = CSPNeXtPAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test depth-wise + neck = CSPNeXtPAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_depthwise=True) + + from mmcv.cnn.bricks import DepthwiseSeparableConvModule + self.assertIs(neck.conv, DepthwiseSeparableConvModule) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py new file mode 100644 index 0000000000000000000000000000000000000000..b79c1ce5bee9f0761b6c3deedc2c8c250ad8aac7 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models import PPYOLOECSPPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestPPYOLOECSPPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = PPYOLOECSPPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_drop_block(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = PPYOLOECSPPAFPN( + in_channels=in_channels, + out_channels=out_channels, + drop_block_cfg=dict( + type='mmdet.DropBlock', + drop_prob=0.1, + block_size=3, + warm_iters=0)) + neck.train() + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..339621ec4ba81de7c913b20dc1530289c3bd8c8c --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.necks import YOLOv5PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5PAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv5PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e766aa8700e292d13d411b3eccc4542b8ef49725 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.necks import (YOLOv6CSPRepBiPAFPN, YOLOv6CSPRepPAFPN, + YOLOv6RepBiPAFPN, YOLOv6RepPAFPN) +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv6PAFPN(TestCase): + + def test_YOLOv6RepPAFP_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6RepPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_YOLOv6CSPRepPAFPN_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6CSPRepPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_YOLOv6CSPRepBiPAFPN_forward(self): + s = 64 + in_channels = [4, 8, 16, 32] # includes an extra input for BiFusion + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6CSPRepBiPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) - 1 + for i in range(len(feats) - 1): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1] + + 
def test_YOLOv6RepBiPAFPN_forward(self): + s = 64 + in_channels = [4, 8, 16, 32] # includes an extra input for BiFusion + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6RepBiPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) - 1 + for i in range(len(feats) - 1): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1] diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..17bf455c12d6f75191813213d286ae9646ef2d14 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmcv.cnn import ConvModule + +from mmyolo.models.necks import YOLOv7PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv7PAFPN(TestCase): + + def test_forward(self): + # test P5 + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv7PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test is_tiny_version + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + is_tiny_version=True) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test use_in_channels_in_downsample + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_in_channels_in_downsample=True) + for f in feats: + print(f.shape) + outs = neck(feats) + for f in outs: + print(f.shape) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test use_repconv_outs is False + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_repconv_outs=False) + self.assertIsInstance(neck.out_layers[0], ConvModule) + + # test P6 + s = 64 + in_channels = [8, 16, 32, 64] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32, 64] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = 
YOLOv7PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..66d136d0f26f68628b29c8a585bfaf4bea0b92fd --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmyolo.models import YOLOv8PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv8PAFPN(TestCase): + + def test_YOLOv8PAFPN_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv8PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..25fe67a12e969c28bfc09d66c265664c038feba5 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.necks import YOLOXPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOXPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = 24 + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOXPAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..4af547c05172a2e8de09a5d56c35fa0b383dcea0 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from unittest import TestCase + +import torch + +from mmyolo.models.plugins import CBAM +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestCBAM(TestCase): + + def test_forward(self): + tensor_shape = (2, 16, 20, 20) + + images = torch.randn(*tensor_shape) + cbam = CBAM(16) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) + + # test other ratio + cbam = CBAM(16, reduce_ratio=8) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) + + # test other act_cfg in ChannelAttention + cbam = CBAM(in_channels=16, act_cfg=dict(type='Sigmoid')) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
class TestBatchATSSAssigner(TestCase):
    """Output-shape tests for ``BatchATSSAssigner.forward``.

    Every scenario uses a batch of two samples, two classes and a
    ``num_level_bboxes`` of ``[64, 16, 4]`` (84 priors in total).
    """

    NUM_CLASSES = 2
    BATCH_SIZE = 2
    NUM_PRIORS = 84

    def _tile_over_batch(self, rows, tensor_cls=torch.FloatTensor, copies=1):
        """Build a tensor from ``rows`` and repeat it for each batch element.

        ``copies`` additionally tiles the rows along the second dimension.
        """
        return tensor_cls(rows).unsqueeze(0).repeat(self.BATCH_SIZE, copies, 1)

    def _regular_priors(self):
        """Return four prior templates tiled 21 times -> shape (84, 4)."""
        templates = torch.FloatTensor([
            [4., 4., 8., 8.],
            [12., 4., 8., 8.],
            [20., 4., 8., 8.],
            [28., 4., 8., 8.],
        ])
        return templates.repeat(21, 1)

    def _regular_pred_bboxes(self):
        """Return four predicted boxes tiled to shape (batch, 84, 4)."""
        return self._tile_over_batch([
            [-4., -4., 12., 12.],
            [4., -4., 20., 12.],
            [12., -4., 28., 12.],
            [20., -4., 36., 12.],
        ], copies=21)

    def _regular_gts(self):
        """Return ``(gt_labels, gt_bboxes)`` holding two GT slots per sample."""
        gt_bboxes = self._tile_over_batch([
            [0, 0, 60, 93],
            [229, 0, 532, 157],
        ])
        gt_labels = self._tile_over_batch([
            [0],
            [11],
        ], tensor_cls=torch.LongTensor)
        return gt_labels, gt_bboxes

    def _empty_gts(self):
        """Return ``(gt_labels, gt_bboxes)`` with zero GT slots per sample."""
        gt_bboxes = torch.zeros(self.BATCH_SIZE, 0, 4)
        gt_labels = torch.zeros(self.BATCH_SIZE, 0, 1)
        return gt_labels, gt_bboxes

    def _regular_pad_flag(self):
        """Return a pad flag of ``[[1], [0]]`` repeated for each sample."""
        return self._tile_over_batch([
            [1],
            [0],
        ])

    def _run_and_check(self, pred_bboxes, priors, gt_labels, gt_bboxes,
                       pad_bbox_flag):
        """Run the assigner and verify the shape of every returned tensor."""
        assigner = BatchATSSAssigner(
            topk=3,
            iou_calculator=dict(type='mmdet.BboxOverlaps2D'),
            num_classes=self.NUM_CLASSES)
        result = assigner.forward(pred_bboxes, priors, [64, 16, 4], gt_labels,
                                  gt_bboxes, pad_bbox_flag)

        expected_shapes = {
            'assigned_labels': [self.BATCH_SIZE, self.NUM_PRIORS],
            'assigned_bboxes': [self.BATCH_SIZE, self.NUM_PRIORS, 4],
            'assigned_scores':
            [self.BATCH_SIZE, self.NUM_PRIORS, self.NUM_CLASSES],
            'fg_mask_pre_prior': [self.BATCH_SIZE, self.NUM_PRIORS],
        }
        # Look every key up first so a missing key surfaces before any
        # shape comparison is made.
        tensors = {key: result[key] for key in expected_shapes}
        for key, shape in expected_shapes.items():
            self.assertEqual(tensors[key].shape, torch.Size(shape))

    def test_batch_atss_assigner(self):
        """Regular priors, predictions and ground truths."""
        gt_labels, gt_bboxes = self._regular_gts()
        self._run_and_check(self._regular_pred_bboxes(),
                            self._regular_priors(), gt_labels, gt_bboxes,
                            self._regular_pad_flag())

    def test_batch_atss_assigner_with_empty_gt(self):
        """Test corner case where an image might have no true detections."""
        gt_labels, gt_bboxes = self._empty_gts()
        self._run_and_check(self._regular_pred_bboxes(),
                            self._regular_priors(), gt_labels, gt_bboxes,
                            self._regular_pad_flag())

    def test_batch_atss_assigner_with_empty_boxs(self):
        """Test corner case where a network might predict no boxes."""
        gt_labels, gt_bboxes = self._regular_gts()
        self._run_and_check(self._regular_pred_bboxes(),
                            torch.zeros(self.NUM_PRIORS, 4), gt_labels,
                            gt_bboxes, self._regular_pad_flag())

    def test_batch_atss_assigner_with_empty_boxes_and_gt(self):
        """Test corner case where a network might predict no boxes and no
        gt."""
        gt_labels, gt_bboxes = self._empty_gts()
        self._run_and_check(
            torch.zeros(self.BATCH_SIZE, 0, 4),
            torch.zeros(self.NUM_PRIORS, 4), gt_labels, gt_bboxes,
            torch.zeros(self.BATCH_SIZE, 0, 1))
class TestBatchDynamicSoftLabelAssigner(TestCase):
    """Output-shape tests for ``BatchDynamicSoftLabelAssigner.forward``."""

    NUM_CLASSES = 2
    BATCH_SIZE = 2

    def _per_batch(self, rows, tensor_cls=torch.FloatTensor, copies=1):
        """Build a tensor from ``rows`` and repeat it for each batch element.

        ``copies`` additionally tiles the rows along the second dimension.
        """
        return tensor_cls(rows).unsqueeze(0).repeat(self.BATCH_SIZE, copies, 1)

    def _make_assigner(self, **extra):
        """Create the assigner; ``extra`` is forwarded to the constructor."""
        return BatchDynamicSoftLabelAssigner(
            num_classes=self.NUM_CLASSES,
            soft_center_radius=3.0,
            topk=1,
            iou_weight=3.0,
            **extra)

    def _pred_scores(self):
        """Return two score rows tiled to shape (batch, 20, 1)."""
        return self._per_batch([
            [0.2],
            [0.8],
        ], copies=10)

    @staticmethod
    def _priors():
        """Return two prior rows tiled to shape (20, 4)."""
        return torch.FloatTensor([[30, 30, 8, 8], [4, 5, 6, 7]]).repeat(10, 1)

    def _single_gt(self, box):
        """Return ``(gt_labels, gt_bboxes, pad_bbox_flag)`` for one GT box."""
        gt_bboxes = self._per_batch([box])
        gt_labels = self._per_batch([[0]], tensor_cls=torch.LongTensor)
        pad_bbox_flag = self._per_batch([[1]])
        return gt_labels, gt_bboxes, pad_bbox_flag

    def _assign_and_check(self, assigner, pred_bboxes, pred_scores, priors,
                          gt_labels, gt_bboxes, pad_bbox_flag, num_priors,
                          box_dim=4):
        """Run ``assigner.forward`` and check the shape of each output."""
        result = assigner.forward(pred_bboxes, pred_scores, priors, gt_labels,
                                  gt_bboxes, pad_bbox_flag)

        labels = result['assigned_labels']
        label_weights = result['assigned_labels_weights']
        bboxes = result['assigned_bboxes']
        metrics = result['assign_metrics']

        flat_shape = torch.Size([self.BATCH_SIZE, num_priors])
        self.assertEqual(labels.shape, flat_shape)
        self.assertEqual(bboxes.shape,
                         torch.Size([self.BATCH_SIZE, num_priors, box_dim]))
        self.assertEqual(label_weights.shape, flat_shape)
        self.assertEqual(metrics.shape, flat_shape)

    def test_assign(self):
        """Regular predictions with a single ground-truth box."""
        pred_bboxes = self._per_batch([
            [23, 23, 43, 43],
            [4, 5, 6, 7],
        ], copies=10)
        gt_labels, gt_bboxes, pad_bbox_flag = self._single_gt(
            [23, 23, 43, 43])
        self._assign_and_check(
            self._make_assigner(),
            pred_bboxes,
            self._pred_scores(),
            self._priors(),
            gt_labels,
            gt_bboxes,
            pad_bbox_flag,
            num_priors=20)

    def test_assign_with_empty_gt(self):
        """Regular predictions with zero ground-truth slots."""
        pred_bboxes = self._per_batch([
            [23, 23, 43, 43],
            [4, 5, 6, 7],
        ], copies=10)
        self._assign_and_check(
            self._make_assigner(),
            pred_bboxes,
            self._pred_scores(),
            self._priors(),
            torch.zeros(self.BATCH_SIZE, 0, 1),
            torch.zeros(self.BATCH_SIZE, 0, 4),
            torch.zeros(self.BATCH_SIZE, 0, 1),
            num_priors=20)

    def test_assign_with_empty_boxs(self):
        """Zero predictions and priors with a single ground-truth box."""
        gt_labels, gt_bboxes, pad_bbox_flag = self._single_gt(
            [23, 23, 43, 43])
        self._assign_and_check(
            self._make_assigner(),
            torch.zeros(self.BATCH_SIZE, 0, 4),
            torch.zeros(self.BATCH_SIZE, 0, 4),
            torch.zeros(0, 4),
            gt_labels,
            gt_bboxes,
            pad_bbox_flag,
            num_priors=0)

    def test_assign_rotate_box(self):
        """Five-value boxes with the ``mmrotate`` IoU calculator.

        Skipped when ``mmrotate`` cannot be imported.
        """
        try:
            import importlib
            importlib.import_module('mmrotate')
        except ImportError:
            pytest.skip('mmrotate is not installed.', allow_module_level=True)

        # RBboxOverlaps2D doesn't support batch input, use loop instead.
        assigner = self._make_assigner(
            iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'),
            batch_iou=False,
        )
        pred_bboxes = self._per_batch([
            [23, 23, 20, 20, 0.078],
            [4, 5, 2, 2, 0.078],
        ], copies=10)
        gt_labels, gt_bboxes, pad_bbox_flag = self._single_gt(
            [23, 23, 20, 20, 0.078])
        self._assign_and_check(
            assigner,
            pred_bboxes,
            self._pred_scores(),
            self._priors(),
            gt_labels,
            gt_bboxes,
            pad_bbox_flag,
            num_priors=20,
            box_dim=5)
class TestBatchTaskAlignedAssigner(TestCase):
    """Output-shape test for ``BatchTaskAlignedAssigner.forward``."""

    def test_batch_task_aligned_assigner(self):
        """All four outputs must cover 84 priors for each of two samples."""
        batch_size, num_classes, num_priors = 2, 4, 84

        def per_batch(rows, tensor_cls=torch.FloatTensor, copies=1):
            # Repeat the rows for each sample; ``copies`` tiles dim 1.
            return tensor_cls(rows).unsqueeze(0).repeat(batch_size, copies, 1)

        pred_scores = per_batch([
            [0.1, 0.2],
            [0.2, 0.3],
            [0.3, 0.4],
            [0.4, 0.5],
        ], copies=21)
        pred_bboxes = per_batch([
            [-4., -4., 12., 12.],
            [4., -4., 20., 12.],
            [12., -4., 28., 12.],
            [20., -4., 36., 12.],
        ], copies=21)
        priors = torch.FloatTensor([
            [0, 0, 4., 4.],
            [0, 0, 12., 4.],
            [0, 0, 20., 4.],
            [0, 0, 28., 4.],
        ]).repeat(21, 1)
        gt_bboxes = per_batch([
            [0, 0, 60, 93],
            [229, 0, 532, 157],
        ])
        gt_labels = per_batch([[0], [1]], tensor_cls=torch.LongTensor)
        pad_bbox_flag = per_batch([[1], [0]])

        assigner = BatchTaskAlignedAssigner(
            num_classes=num_classes, alpha=1, beta=6, topk=13, eps=1e-9)
        result = assigner.forward(pred_bboxes, pred_scores, priors, gt_labels,
                                  gt_bboxes, pad_bbox_flag)

        expected_shapes = {
            'assigned_labels': [batch_size, num_priors],
            'assigned_bboxes': [batch_size, num_priors, 4],
            'assigned_scores': [batch_size, num_priors, num_classes],
            'fg_mask_pre_prior': [batch_size, num_priors],
        }
        # Look every key up first so a missing key surfaces before any
        # shape comparison is made.
        tensors = {key: result[key] for key in expected_shapes}
        for key, shape in expected_shapes.items():
            self.assertEqual(tensors[key].shape, torch.Size(shape))


class TestPoseSimOTAAssigner(TestCase):
    """Checks the ``gt_inds`` produced by ``PoseSimOTAAssigner.assign``."""

    @staticmethod
    def _make_assigner():
        """Create the assigner configuration shared by all tests."""
        return PoseSimOTAAssigner(
            center_radius=2.5,
            candidate_topk=1,
            iou_weight=3.0,
            cls_weight=1.0,
            iou_calculator=dict(type='mmdet.BboxOverlaps2D'))

    def _check_gt_inds(self, pred_instances, gt_instances, expected):
        """Assign and compare the resulting ``gt_inds`` with ``expected``."""
        outcome = self._make_assigner().assign(
            pred_instances=pred_instances, gt_instances=gt_instances)
        assert_allclose(outcome.gt_inds, torch.LongTensor(expected))

    def test_assign(self):
        """The first prediction is matched to the only GT, the second not."""
        # Each prediction row is 4 box values followed by 51 ones.
        trailing = [1] * 51
        pred_instances = InstanceData(
            bboxes=torch.Tensor([[23, 23, 43, 43] + trailing,
                                 [4, 5, 6, 7] + trailing]),
            scores=torch.FloatTensor([[0.2], [0.8]]),
            priors=torch.Tensor([[30, 30, 8, 8], [4, 5, 6, 7]]))
        visibility = [1.] * 8 + [0.] + [1.] * 2 + [0.] * 6
        gt_instances = InstanceData(
            bboxes=torch.Tensor([[23, 23, 43, 43]]),
            labels=torch.LongTensor([0]),
            keypoints_visible=torch.Tensor([visibility]),
            keypoints=torch.Tensor([[[30, 30]] * 17]))
        self._check_gt_inds(pred_instances, gt_instances, [1, 0])

    def test_assign_with_no_valid_bboxes(self):
        """No prediction is matched to the single GT box."""
        pred_instances = InstanceData(
            bboxes=torch.Tensor([[123, 123, 143, 143], [114, 151, 161, 171]]),
            scores=torch.FloatTensor([[0.2], [0.8]]),
            priors=torch.Tensor([[30, 30, 8, 8], [55, 55, 8, 8]]))
        gt_instances = InstanceData(
            bboxes=torch.Tensor([[0, 0, 1, 1]]),
            labels=torch.LongTensor([0]),
            keypoints_visible=torch.zeros((1, 17)),
            keypoints=torch.zeros((1, 17, 2)))
        self._check_gt_inds(pred_instances, gt_instances, [0, 0])

    def test_assign_with_empty_gt(self):
        """With zero GT instances every prediction gets index 0."""
        pred_instances = InstanceData(
            bboxes=torch.Tensor([[[30, 40, 50, 60]], [[4, 5, 6, 7]]]),
            scores=torch.FloatTensor([[0.2], [0.8]]),
            priors=torch.Tensor([[0, 12, 23, 34], [4, 5, 6, 7]]))
        gt_instances = InstanceData(
            bboxes=torch.empty(0, 4),
            labels=torch.empty(0),
            keypoints_visible=torch.empty(0, 17),
            keypoints=torch.empty(0, 17, 2))
        self._check_gt_inds(pred_instances, gt_instances, [0, 0])
a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..10b0215c27d7a1f88f894f459cf641555833da9e --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
class TestDistancePointBBoxCoder(TestCase):
    """Decoding tests for ``DistancePointBBoxCoder``."""

    def test_decoder(self):
        """Decode known inputs, then check the batched call agrees."""
        anchor_points = torch.Tensor([
            [74., 61.],
            [-29., 106.],
            [138., 61.],
            [29., 170.],
        ])
        raw_preds = torch.Tensor([
            [0, -1, 3, 3],
            [-1, -7, -4.8, 9],
            [-23, -1, 12, 1],
            [14.5, -13, 10, 18.3],
        ])
        level_strides = torch.Tensor([2, 4, 6, 6])
        reference = torch.Tensor([
            [74, 63, 80, 67],
            [-25, 134, -48.2, 142],
            [276, 67, 210, 67],
            [-58, 248, 89, 279.8],
        ])

        bbox_coder = DistancePointBBoxCoder()
        decoded = bbox_coder.decode(anchor_points, raw_preds, level_strides)
        assert torch.allclose(reference, decoded)

        # A batch made of two copies must decode to the same boxes.
        stacked_points = torch.stack([anchor_points, anchor_points])
        stacked_preds = torch.stack([raw_preds, raw_preds])
        decoded_batch = bbox_coder.decode(stacked_points, stacked_preds,
                                          level_strides)
        assert torch.allclose(decoded, decoded_batch[0])
class TestYOLOv5Coder(TestCase):
    """Decoding tests for ``YOLOv5BBoxCoder``."""

    def test_decoder(self):
        """Decode known inputs, then check the batched call agrees."""
        prior_boxes = torch.Tensor([
            [10., 10., 20., 20.],
            [10., 8., 10., 10.],
            [15., 8., 20., 3.],
            [2., 5., 5., 8.],
        ])
        raw_preds = torch.Tensor([
            [0.0000, 0.0000, 1.0000, 1.0000],
            [0.1409, 0.1409, 2.8591, 2.8591],
            [0.0000, 0.3161, 4.1945, 0.6839],
            [1.0000, 5.0000, 9.0000, 5.0000],
        ])
        level_strides = torch.Tensor([2, 4, 8, 8])
        reference = torch.Tensor([
            [4.3111, 4.3111, 25.6889, 25.6889],
            [10.2813, 5.7033, 10.2813, 12.8594],
            [7.7949, 11.1710, 27.2051, 2.3369],
            [1.1984, 8.4730, 13.1955, 20.3129],
        ])

        bbox_coder = YOLOv5BBoxCoder()
        decoded = bbox_coder.decode(prior_boxes, raw_preds, level_strides)
        assert torch.allclose(reference, decoded, atol=1e-04)

        # A batch made of two copies must decode to the same boxes.
        stacked_priors = torch.stack([prior_boxes, prior_boxes])
        stacked_preds = torch.stack([raw_preds, raw_preds])
        decoded_batch = bbox_coder.decode(stacked_priors, stacked_preds,
                                          level_strides)
        assert torch.allclose(decoded, decoded_batch[0])
# NOTE(review): this class exercises ``YOLOXBBoxCoder``; the ``YOLOv5`` in its
# name looks like a copy-paste leftover. The name is kept so existing test
# selections (e.g. ``-k TestYOLOv5Coder``) keep working.
class TestYOLOv5Coder(TestCase):
    """Decoding tests for ``YOLOXBBoxCoder``."""

    def test_decoder(self):
        """Decode known inputs, then check the batched call agrees."""
        prior_points = torch.Tensor([
            [10., 10.],
            [8., 8.],
            [15., 8.],
            [2., 5.],
        ])
        raw_preds = torch.Tensor([
            [0.0000, 0.0000, 1.0000, 1.0000],
            [0.0409, 0.1409, 0.8591, 0.8591],
            [0.0000, 0.3161, 0.1945, 0.6839],
            [1.0000, 5.0000, 0.2000, 0.6000],
        ])
        level_strides = torch.Tensor([2, 4, 6, 6])
        reference = torch.Tensor([
            [7.2817, 7.2817, 12.7183, 12.7183],
            [3.4415, 3.8415, 12.8857, 13.2857],
            [11.3559, 3.9518, 18.6441, 15.8414],
            [4.3358, 29.5336, 11.6642, 40.4664],
        ])

        bbox_coder = YOLOXBBoxCoder()
        decoded = bbox_coder.decode(prior_points, raw_preds, level_strides)
        assert torch.allclose(reference, decoded, atol=1e-04)

        # A batch made of two copies must decode to the same boxes.
        stacked_priors = torch.stack([prior_points, prior_points])
        stacked_preds = torch.stack([raw_preds, raw_preds])
        decoded_batch = bbox_coder.decode(stacked_priors, stacked_preds,
                                          level_strides)
        assert torch.allclose(decoded, decoded_batch[0])
class TestGtInstancesPreprocess:
    """Shape checks for ``gt_instances_preprocess`` with 4- and 5-value
    boxes."""

    @pytest.mark.parametrize('box_dim', [4, 5])
    def test(self, box_dim):
        """A list holding one empty ``InstanceData`` yields a 3-D tensor."""
        empty_instances = InstanceData(
            bboxes=torch.empty((0, box_dim)), labels=torch.LongTensor([]))
        packed = gt_instances_preprocess([empty_instances], 1)

        assert isinstance(packed, Tensor)
        assert packed.dim() == 3, 'the len of result must be 3.'
        # The last dimension is one wider than the box itself.
        assert packed.shape[-1] == box_dim + 1

    @pytest.mark.parametrize('box_dim', [4, 5])
    def test_fast_version(self, box_dim: int):
        """A single-row tensor input yields a ``(?, 1, box_dim + 1)``
        tensor."""
        # One row: the values 0 and 1 followed by ``box_dim`` zeros.
        row = [0., 1.] + [0.] * box_dim
        raw_instances = torch.from_numpy(np.array([row], dtype=np.float32))
        packed = gt_instances_preprocess(raw_instances, 1)

        assert isinstance(packed, Tensor)
        assert packed.dim() == 3, 'the len of result must be 3.'
        assert packed.shape[1] == 1
        assert packed.shape[2] == box_dim + 1
class TestCollectEnv(TestCase):
    """Checks the dictionary returned by ``collect_env``."""

    def test_collect_env(self):
        """Required keys exist and version entries match the live modules."""
        report = collect_env()
        print(report)

        always_required = (
            'sys.platform', 'Python', 'CUDA available', 'PyTorch',
            'PyTorch compiling details', 'OpenCV', 'MMEngine', 'GCC')
        for name in always_required:
            assert name in report

        # The CUDA toolchain entries are only required when CUDA is reported
        # as available.
        if report['CUDA available']:
            for name in ('CUDA_HOME', 'NVCC'):
                assert name in report

        assert report['sys.platform'] == sys.platform
        assert report['Python'] == sys.version.replace('\n', '')

        versioned_packages = (
            ('MMEngine', mmengine),
            ('MMCV', mmcv),
            ('MMDetection', mmdet),
        )
        for name, package in versioned_packages:
            assert report[name] == package.__version__
class TestSetupEnv(TestCase):
    """Tests for ``register_all_modules``."""

    DATASET_NAME = 'YOLOv5CocoDataset'

    def _forget_dataset(self, registry):
        """Drop the dataset modules and its registry entry.

        After this call the dataset name must no longer be registered, so the
        next ``register_all_modules`` call has to register it again.
        """
        for module_name in ('mmyolo.datasets', 'mmyolo.datasets.yolov5_coco'):
            sys.modules.pop(module_name, None)
        registry._module_dict.pop(self.DATASET_NAME, None)
        self.assertNotIn(self.DATASET_NAME, registry.module_dict)

    def test_register_all_modules(self):
        """Registration works with and without initialising the scope."""
        from mmyolo.registry import DATASETS

        # not init default scope
        self._forget_dataset(DATASETS)
        register_all_modules(init_default_scope=False)
        self.assertIn(self.DATASET_NAME, DATASETS.module_dict)

        # init default scope
        self._forget_dataset(DATASETS)
        register_all_modules(init_default_scope=True)
        self.assertIn(self.DATASET_NAME, DATASETS.module_dict)
        current_scope = DefaultScope.get_current_instance().scope_name
        self.assertEqual(current_scope, 'mmyolo')

        # init default scope when another scope is init
        other_scope = f'test-{datetime.datetime.now()}'
        DefaultScope.get_instance(other_scope, scope_name='test')
        expected_warning = 'The current default scope "test" is not "mmyolo"'
        with self.assertWarnsRegex(Warning, expected_warning):
            register_all_modules(init_default_scope=True)
# TODO: Refactoring and improving
def parse_args():
    """Build and parse the command-line arguments of the benchmark script.

    As a side effect, ``LOCAL_RANK`` is exported to the process environment
    (taken from ``--local_rank``) when it is not already set.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(description='MMYOLO benchmark a model')
    parser.add_argument('config', help='test config file path')
    parser.add_argument('checkpoint', help='checkpoint file')
    parser.add_argument(
        '--repeat-num',
        type=int,
        default=1,
        help='number of repeat times of measurement for averaging the results')
    parser.add_argument(
        '--max-iter', type=int, default=2000, help='num of max iter')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--work-dir',
        help='the directory to save the file containing '
        'benchmark metrics')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # The trailing space matters: the two literals are concatenated.
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    # Accept both spellings: older launchers pass ``--local_rank`` while
    # PyTorch >= 2.0 passes ``--local-rank``. The destination stays
    # ``args.local_rank`` because the first long option defines it.
    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args


def measure_inference_speed(cfg, checkpoint, max_iter, log_interval,
                            is_fuse_conv_bn):
    """Measure the inference speed of a model on the test dataloader.

    The first ``num_warmup`` (5) iterations are executed but not timed.

    Args:
        cfg: Config providing ``env_cfg`` (optional), ``test_dataloader``,
            ``model`` and ``distributed``.
        checkpoint: Checkpoint passed to ``load_checkpoint``.
        max_iter (int): Stop after this many iterations. The measurement
            also ends when the dataloader is exhausted first.
        log_interval (int): Log the running FPS every ``log_interval``
            iterations.
        is_fuse_conv_bn (bool): Requested conv/bn fusion. Fusion is NOT
            implemented here; a warning is logged when it is requested.

    Returns:
        float: Images per second over the timed iterations, or ``0`` when no
        iteration could be timed.
    """
    import logging

    # ``env_cfg`` may be absent (or None); fall back to an empty mapping
    # instead of failing on ``None.get``.
    env_cfg = cfg.get('env_cfg') or {}
    if env_cfg.get('cudnn_benchmark'):
        torch.backends.cudnn.benchmark = True

    mp_cfg: dict = env_cfg.get('mp_cfg', {})
    set_multi_processing(**mp_cfg, distributed=cfg.distributed)

    if is_fuse_conv_bn:
        # Be explicit rather than silently ignoring the user's request.
        print_log(
            'Fusing conv and bn was requested, but this script does not '
            'implement it; the unfused model is benchmarked.',
            'current',
            level=logging.WARNING)

    # Because multiple processes will occupy additional CPU resources,
    # FPS statistics will be more unstable when num_workers is not 0.
    # It is reasonable to set num_workers to 0.
    dataloader_cfg = cfg.test_dataloader
    dataloader_cfg['num_workers'] = 0
    dataloader_cfg['batch_size'] = 1
    dataloader_cfg['persistent_workers'] = False
    data_loader = Runner.build_dataloader(dataloader_cfg)

    # build the model and load checkpoint
    model = MODELS.build(cfg.model)
    load_checkpoint(model, checkpoint, map_location='cpu')
    model = model.cuda()
    model.eval()

    # the first several iterations may be very slow so skip them
    num_warmup = 5
    pure_inf_time = 0
    num_timed = 0
    fps = 0

    # benchmark up to ``max_iter`` images and take the average
    for i, data in enumerate(data_loader):

        torch.cuda.synchronize()
        start_time = time.perf_counter()

        with torch.no_grad():
            model.test_step(data)

        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        if i >= num_warmup:
            pure_inf_time += elapsed
            num_timed += 1
            if (i + 1) % log_interval == 0:
                fps = num_timed / pure_inf_time
                print_log(
                    f'Done image [{i + 1:<3}/ {max_iter}], '
                    f'fps: {fps:.1f} img / s, '
                    f'times per image: {1000 / fps:.1f} ms / img', 'current')

        if (i + 1) == max_iter:
            break

    # Compute the overall figure after the loop so it is also reported when
    # the dataloader runs out before ``max_iter`` is reached.
    if num_timed == 0 or pure_inf_time <= 0:
        print_log(
            f'No iteration was timed: fewer than {num_warmup + 1} '
            'iterations were run, so all of them counted as warm-up.',
            'current',
            level=logging.WARNING)
        return fps

    fps = num_timed / pure_inf_time
    print_log(
        f'Overall fps: {fps:.1f} img / s, '
        f'times per image: {1000 / fps:.1f} ms / img', 'current')
    return fps


def repeat_measure_inference_speed(cfg,
                                   checkpoint,
                                   max_iter,
                                   log_interval,
                                   is_fuse_conv_bn,
                                   repeat_num=1):
    """Run ``measure_inference_speed`` ``repeat_num`` times.

    Each run works on a deep copy of ``cfg`` so that changes made to the
    dataloader config by one run do not leak into the next.

    Args:
        cfg, checkpoint, max_iter, log_interval, is_fuse_conv_bn: Forwarded
            unchanged to ``measure_inference_speed``.
        repeat_num (int): Number of measurements, at least 1.

    Returns:
        float | list[float]: The single FPS value when ``repeat_num`` is 1,
        otherwise the list of FPS values of all runs.

    Raises:
        ValueError: If ``repeat_num`` is smaller than 1.
    """
    # An explicit check survives ``python -O``, unlike ``assert``.
    if repeat_num < 1:
        raise ValueError(f'repeat_num must be >= 1, but got {repeat_num}')

    fps_list = []

    for _ in range(repeat_num):
        cp_cfg = copy.deepcopy(cfg)

        fps_list.append(
            measure_inference_speed(cp_cfg, checkpoint, max_iter, log_interval,
                                    is_fuse_conv_bn))

    if repeat_num > 1:
        fps_list_ = [round(fps, 1) for fps in fps_list]
        # A run that timed nothing reports 0 fps; avoid dividing by zero.
        times_pre_image_list_ = [
            round(1000 / fps, 1) if fps > 0 else float('inf')
            for fps in fps_list
        ]
        mean_fps_ = sum(fps_list_) / len(fps_list_)
        mean_times_pre_image_ = sum(times_pre_image_list_) / len(
            times_pre_image_list_)
        print_log(
            f'Overall fps: {fps_list_}[{mean_fps_:.1f}] img / s, '
            f'times per image: '
            f'{times_pre_image_list_}[{mean_times_pre_image_:.1f}] ms / img',
            'current')
        return fps_list

    return fps_list[0]


# TODO: refactoring
def main():
    """Entry point: load the config, set up logging and run the benchmark."""
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    init_default_scope(cfg.get('default_scope', 'mmyolo'))

    distributed = False
    if args.launcher != 'none':
        init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {}))
        distributed = True
        assert get_world_size(
        ) == 1, 'Inference benchmark does not allow distributed multi-GPU'

    cfg.distributed = distributed

    # Only log to a file when a work directory was given.
    log_file = None
    if args.work_dir:
        log_file = os.path.join(args.work_dir, 'benchmark.log')
        mkdir_or_exist(args.work_dir)

    MMLogger.get_instance('mmyolo', log_file=log_file, log_level='INFO')

    repeat_measure_inference_speed(cfg, args.checkpoint, args.max_iter,
                                   args.log_interval, args.fuse_conv_bn,
                                   args.repeat_num)


if __name__ == '__main__':
    main()
b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_coco_json.py new file mode 100644 index 0000000000000000000000000000000000000000..71a2fc2a942d234e1ce2e3e93901a66bacb123df --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_coco_json.py @@ -0,0 +1,147 @@ +import argparse +import os.path as osp + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +from pycocotools.coco import COCO + + +def show_coco_json(args): + if args.data_root is not None: + coco = COCO(osp.join(args.data_root, args.ann_file)) + else: + coco = COCO(args.ann_file) + print(f'Total number of images:{len(coco.getImgIds())}') + categories = coco.loadCats(coco.getCatIds()) + category_names = [category['name'] for category in categories] + print(f'Total number of Categories : {len(category_names)}') + print('Categories: \n{}\n'.format(' '.join(category_names))) + + if args.category_names is None: + category_ids = [] + else: + assert set(category_names) > set(args.category_names) + category_ids = coco.getCatIds(args.category_names) + + image_ids = coco.getImgIds(catIds=category_ids) + + if args.shuffle: + np.random.shuffle(image_ids) + + for i in range(len(image_ids)): + image_data = coco.loadImgs(image_ids[i])[0] + if args.data_root is not None: + image_path = osp.join(args.data_root, args.img_dir, + image_data['file_name']) + else: + image_path = osp.join(args.img_dir, image_data['file_name']) + + annotation_ids = coco.getAnnIds( + imgIds=image_data['id'], catIds=category_ids, iscrowd=0) + annotations = coco.loadAnns(annotation_ids) + + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + plt.figure() + plt.imshow(image) + + if args.disp_all: + coco.showAnns(annotations) + else: + show_bbox_only(coco, annotations) + + if args.wait_time == 0: + plt.show() + else: + plt.show(block=False) + plt.pause(args.wait_time) + + 
plt.close() + + +def show_bbox_only(coco, anns, show_label_bbox=True, is_filling=True): + """Show bounding box of annotations Only.""" + if len(anns) == 0: + return + + ax = plt.gca() + ax.set_autoscale_on(False) + + image2color = dict() + for cat in coco.getCatIds(): + image2color[cat] = (np.random.random((1, 3)) * 0.7 + 0.3).tolist()[0] + + polygons = [] + colors = [] + + for ann in anns: + color = image2color[ann['category_id']] + bbox_x, bbox_y, bbox_w, bbox_h = ann['bbox'] + poly = [[bbox_x, bbox_y], [bbox_x, bbox_y + bbox_h], + [bbox_x + bbox_w, bbox_y + bbox_h], [bbox_x + bbox_w, bbox_y]] + polygons.append(Polygon(np.array(poly).reshape((4, 2)))) + colors.append(color) + + if show_label_bbox: + label_bbox = dict(facecolor=color) + else: + label_bbox = None + + ax.text( + bbox_x, + bbox_y, + '%s' % (coco.loadCats(ann['category_id'])[0]['name']), + color='white', + bbox=label_bbox) + + if is_filling: + p = PatchCollection( + polygons, facecolor=colors, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection( + polygons, facecolor='none', edgecolors=colors, linewidths=2) + ax.add_collection(p) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Show coco json file') + parser.add_argument('--data-root', default=None, help='dataset root') + parser.add_argument( + '--img-dir', default='data/coco/train2017', help='image folder path') + parser.add_argument( + '--ann-file', + default='data/coco/annotations/instances_train2017.json', + help='ann file path') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--disp-all', + action='store_true', + help='Whether to display all types of data, ' + 'such as bbox and mask.' 
+ ' Default is to display only bbox') + parser.add_argument( + '--category-names', + type=str, + default=None, + nargs='+', + help='Display category-specific data, e.g., "bicycle", "person"') + parser.add_argument( + '--shuffle', + action='store_true', + help='Whether to display in disorder') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + show_coco_json(args) + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..21a1d709d3ced0e5f865748afa0a1e258a8751f9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset.py @@ -0,0 +1,276 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +import sys +from typing import Tuple + +import cv2 +import mmcv +import numpy as np +from mmdet.models.utils import mask2ndarray +from mmdet.structures.bbox import BaseBoxes +from mmengine.config import Config, DictAction +from mmengine.dataset import Compose +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from mmengine.visualization import Visualizer + +from mmyolo.registry import DATASETS, VISUALIZERS + + +# TODO: Support for printing the change in key of results +# TODO: Some bug. If you meet some bug, please use the original +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--phase', + '-p', + default='train', + type=str, + choices=['train', 'test', 'val'], + help='phase of dataset to visualize, accept "train" "test" and "val".' 
+ ' Defaults to "train".') + parser.add_argument( + '--mode', + '-m', + default='transformed', + type=str, + choices=['original', 'transformed', 'pipeline'], + help='display mode; display original pictures or ' + 'transformed pictures or comparison pictures. "original" ' + 'means show images load from disk; "transformed" means ' + 'to show images after transformed; "pipeline" means show all ' + 'the intermediate images. Defaults to "transformed".') + parser.add_argument( + '--out-dir', + default='output', + type=str, + help='If there is no display interface, you can save it.') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-number', + '-n', + type=int, + default=sys.maxsize, + help='number of images selected to visualize, ' + 'must bigger than 0. if the number is bigger than length ' + 'of dataset, show all the images in dataset; ' + 'default "sys.maxsize", show all images in dataset') + parser.add_argument( + '--show-interval', + '-i', + type=float, + default=3, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def _get_adaptive_scale(img_shape: Tuple[int, int], + min_scale: float = 0.3, + max_scale: float = 3.0) -> float: + """Get adaptive scale according to image shape. + + The target scale depends on the the short edge length of the image. If the + short edge length equals 224, the output is 1.0. And output linear + scales according the short edge length. 
You can also specify the minimum + scale and the maximum scale to limit the linear scale. + + Args: + img_shape (Tuple[int, int]): The shape of the canvas image. + min_scale (int): The minimum scale. Defaults to 0.3. + max_scale (int): The maximum scale. Defaults to 3.0. + Returns: + int: The adaptive scale. + """ + short_edge_length = min(img_shape) + scale = short_edge_length / 224. + return min(max(scale, min_scale), max_scale) + + +def make_grid(imgs, names): + """Concat list of pictures into a single big picture, align height here.""" + visualizer = Visualizer.get_current_instance() + ori_shapes = [img.shape[:2] for img in imgs] + max_height = int(max(img.shape[0] for img in imgs) * 1.1) + min_width = min(img.shape[1] for img in imgs) + horizontal_gap = min_width // 10 + img_scale = _get_adaptive_scale((max_height, min_width)) + + texts = [] + text_positions = [] + start_x = 0 + for i, img in enumerate(imgs): + pad_height = (max_height - img.shape[0]) // 2 + pad_width = horizontal_gap // 2 + # make border + imgs[i] = cv2.copyMakeBorder( + img, + pad_height, + max_height - img.shape[0] - pad_height + int(img_scale * 30 * 2), + pad_width, + pad_width, + cv2.BORDER_CONSTANT, + value=(255, 255, 255)) + texts.append(f'{"execution: "}{i}\n{names[i]}\n{ori_shapes[i]}') + text_positions.append( + [start_x + img.shape[1] // 2 + pad_width, max_height]) + start_x += img.shape[1] + horizontal_gap + + display_img = np.concatenate(imgs, axis=1) + visualizer.set_image(display_img) + img_scale = _get_adaptive_scale(display_img.shape[:2]) + visualizer.draw_texts( + texts, + positions=np.array(text_positions), + font_sizes=img_scale * 7, + colors='black', + horizontal_alignments='center', + font_families='monospace') + return visualizer.get_image() + + +def swap_pipeline_position(dataset_cfg): + load_ann_tfm_name = 'LoadAnnotations' + pipeline = dataset_cfg.get('pipeline') + if (pipeline is None): + return dataset_cfg + all_transform_types = [tfm['type'] for tfm in pipeline] + 
if load_ann_tfm_name in all_transform_types: + load_ann_tfm_index = all_transform_types.index(load_ann_tfm_name) + load_ann_tfm = pipeline.pop(load_ann_tfm_index) + pipeline.insert(1, load_ann_tfm) + + +class InspectCompose(Compose): + """Compose multiple transforms sequentially. + + And record "img" field of all results in one list. + """ + + def __init__(self, transforms, intermediate_imgs): + super().__init__(transforms=transforms) + self.intermediate_imgs = intermediate_imgs + + def __call__(self, data): + if 'img' in data: + self.intermediate_imgs.append({ + 'name': 'original', + 'img': data['img'].copy() + }) + self.ptransforms = [ + self.transforms[i] for i in range(len(self.transforms) - 1) + ] + for t in self.ptransforms: + data = t(data) + # Keep the same meta_keys in the PackDetInputs + self.transforms[-1].meta_keys = [key for key in data] + data_sample = self.transforms[-1](data) + if data is None: + return None + if 'img' in data: + self.intermediate_imgs.append({ + 'name': + t.__class__.__name__, + 'dataset_sample': + data_sample['data_samples'] + }) + return data + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + dataset_cfg = cfg.get(args.phase + '_dataloader').get('dataset') + if (args.phase in ['test', 'val']): + swap_pipeline_position(dataset_cfg) + dataset = DATASETS.build(dataset_cfg) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + intermediate_imgs = [] + + if not hasattr(dataset, 'pipeline'): + # for dataset_wrapper + dataset = dataset.dataset + + # TODO: The dataset wrapper occasion is not considered here + dataset.pipeline = InspectCompose(dataset.pipeline.transforms, + intermediate_imgs) + + # init visualization image number + assert args.show_number > 0 + display_number = min(args.show_number, len(dataset)) + + progress_bar = 
ProgressBar(display_number) + for i, item in zip(range(display_number), dataset): + image_i = [] + result_i = [result['dataset_sample'] for result in intermediate_imgs] + for k, datasample in enumerate(result_i): + image = datasample.img + gt_instances = datasample.gt_instances + image = image[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + gt_masks = gt_instances.get('masks', None) + if gt_masks is not None: + masks = mask2ndarray(gt_masks) + gt_instances.masks = masks.astype(bool) + datasample.gt_instances = gt_instances + # get filename from dataset or just use index as filename + visualizer.add_datasample( + 'result', + image, + datasample, + draw_pred=False, + draw_gt=True, + show=False) + image_show = visualizer.get_image() + image_i.append(image_show) + + if args.mode == 'original': + image = image_i[0] + elif args.mode == 'transformed': + image = image_i[-1] + else: + image = make_grid([result for result in image_i], + [result['name'] for result in intermediate_imgs]) + + if hasattr(datasample, 'img_path'): + filename = osp.basename(datasample.img_path) + else: + # some dataset have not image path + filename = f'{i}.jpg' + out_file = osp.join(args.out_dir, + filename) if args.out_dir is not None else None + + if out_file is not None: + mmcv.imwrite(image[..., ::-1], out_file) + + if not args.not_show: + visualizer.show( + image, win_name=filename, wait_time=args.show_interval) + + intermediate_imgs.clear() + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..ebacbde3a5a2e1212089e4d4038fa286d462071b --- /dev/null +++ 
b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmdet.models.utils import mask2ndarray +from mmdet.structures.bbox import BaseBoxes +from mmengine.config import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import DATASETS, VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=0, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # register all modules in mmdet into the registries + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + progress_bar = ProgressBar(len(dataset)) + for item in dataset: + img = item['inputs'].permute(1, 2, 0).numpy() + data_sample = item['data_samples'].numpy() + gt_instances = data_sample.gt_instances + img_path = osp.basename(item['data_samples'].img_path) + + out_file = osp.join( + args.output_dir, + osp.basename(img_path)) if args.output_dir is not None else None + + img = img[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + gt_masks = gt_instances.get('masks', None) + if gt_masks is not None: + masks = mask2ndarray(gt_masks) + gt_instances.masks = masks.astype(bool) + data_sample.gt_instances = gt_instances + + visualizer.add_datasample( + osp.basename(img_path), + img, + data_sample, + draw_pred=False, + show=not args.not_show, + wait_time=args.show_interval, + out_file=out_file) + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..f48abdb90eadba3d50bec106c2ad0ea7709e897d --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py @@ -0,0 +1,273 @@ +import argparse +import os + 
+import matplotlib.pyplot as plt +import numpy as np +from matplotlib.ticker import MultipleLocator +from mmcv.ops import nms +from mmdet.evaluation import bbox_overlaps +from mmdet.utils import replace_cfg_vals, update_data_root +from mmengine import Config, DictAction +from mmengine.fileio import load +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import DATASETS + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate confusion matrix from detection results') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'prediction_path', help='prediction path where test .pkl result') + parser.add_argument( + 'save_dir', help='directory where confusion matrix will be saved') + parser.add_argument( + '--show', action='store_true', help='show confusion matrix') + parser.add_argument( + '--color-theme', + default='plasma', + help='theme of the matrix color map') + parser.add_argument( + '--score-thr', + type=float, + default=0.3, + help='score threshold to filter detection bboxes') + parser.add_argument( + '--tp-iou-thr', + type=float, + default=0.5, + help='IoU threshold to be considered as matched') + parser.add_argument( + '--nms-iou-thr', + type=float, + default=None, + help='nms IoU threshold, only applied when users want to change the' + 'nms IoU threshold.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def calculate_confusion_matrix(dataset, + results, + score_thr=0, + nms_iou_thr=None, + tp_iou_thr=0.5): + """Calculate the confusion matrix. + + Args: + dataset (Dataset): Test or val dataset. + results (list[ndarray]): A list of detection results in each image. + score_thr (float|optional): Score threshold to filter bboxes. + Default: 0. + nms_iou_thr (float|optional): nms IoU threshold, the detection results + have done nms in the detector, only applied when users want to + change the nms IoU threshold. Default: None. + tp_iou_thr (float|optional): IoU threshold to be considered as matched. + Default: 0.5. + """ + num_classes = len(dataset.metainfo['classes']) + confusion_matrix = np.zeros(shape=[num_classes + 1, num_classes + 1]) + assert len(dataset) == len(results) + prog_bar = ProgressBar(len(results)) + for idx, per_img_res in enumerate(results): + res_bboxes = per_img_res['pred_instances'] + gts = dataset.get_data_info(idx)['instances'] + analyze_per_img_dets(confusion_matrix, gts, res_bboxes, score_thr, + tp_iou_thr, nms_iou_thr) + prog_bar.update() + return confusion_matrix + + +def analyze_per_img_dets(confusion_matrix, + gts, + result, + score_thr=0, + tp_iou_thr=0.5, + nms_iou_thr=None): + """Analyze detection results on each image. + + Args: + confusion_matrix (ndarray): The confusion matrix, + has shape (num_classes + 1, num_classes + 1). + gt_bboxes (ndarray): Ground truth bboxes, has shape (num_gt, 4). + gt_labels (ndarray): Ground truth labels, has shape (num_gt). + result (ndarray): Detection results, has shape + (num_classes, num_bboxes, 5). + score_thr (float): Score threshold to filter bboxes. + Default: 0. + tp_iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. 
+ nms_iou_thr (float|optional): nms IoU threshold, the detection results + have done nms in the detector, only applied when users want to + change the nms IoU threshold. Default: None. + """ + true_positives = np.zeros(len(gts)) + gt_bboxes = [] + gt_labels = [] + for gt in gts: + gt_bboxes.append(gt['bbox']) + gt_labels.append(gt['bbox_label']) + + gt_bboxes = np.array(gt_bboxes) + gt_labels = np.array(gt_labels) + + unique_label = np.unique(result['labels'].numpy()) + + for det_label in unique_label: + mask = (result['labels'] == det_label) + det_bboxes = result['bboxes'][mask].numpy() + det_scores = result['scores'][mask].numpy() + + if nms_iou_thr: + det_bboxes, _ = nms( + det_bboxes, det_scores, nms_iou_thr, score_threshold=score_thr) + ious = bbox_overlaps(det_bboxes[:, :4], gt_bboxes) + for i, score in enumerate(det_scores): + det_match = 0 + if score >= score_thr: + for j, gt_label in enumerate(gt_labels): + if ious[i, j] >= tp_iou_thr: + det_match += 1 + if gt_label == det_label: + true_positives[j] += 1 # TP + confusion_matrix[gt_label, det_label] += 1 + if det_match == 0: # BG FP + confusion_matrix[-1, det_label] += 1 + for num_tp, gt_label in zip(true_positives, gt_labels): + if num_tp == 0: # FN + confusion_matrix[gt_label, -1] += 1 + + +def plot_confusion_matrix(confusion_matrix, + labels, + save_dir=None, + show=True, + title='Normalized Confusion Matrix', + color_theme='plasma'): + """Draw confusion matrix with matplotlib. + + Args: + confusion_matrix (ndarray): The confusion matrix. + labels (list[str]): List of class names. + save_dir (str|optional): If set, save the confusion matrix plot to the + given path. Default: None. + show (bool): Whether to show the plot. Default: True. + title (str): Title of the plot. Default: `Normalized Confusion Matrix`. + color_theme (str): Theme of the matrix color map. Default: `plasma`. 
+ """ + # normalize the confusion matrix + per_label_sums = confusion_matrix.sum(axis=1)[:, np.newaxis] + confusion_matrix = \ + confusion_matrix.astype(np.float32) / per_label_sums * 100 + + num_classes = len(labels) + fig, ax = plt.subplots( + figsize=(0.5 * num_classes, 0.5 * num_classes * 0.8), dpi=180) + cmap = plt.get_cmap(color_theme) + im = ax.imshow(confusion_matrix, cmap=cmap) + plt.colorbar(mappable=im, ax=ax) + + title_font = {'weight': 'bold', 'size': 12} + ax.set_title(title, fontdict=title_font) + label_font = {'size': 10} + plt.ylabel('Ground Truth Label', fontdict=label_font) + plt.xlabel('Prediction Label', fontdict=label_font) + + # draw locator + xmajor_locator = MultipleLocator(1) + xminor_locator = MultipleLocator(0.5) + ax.xaxis.set_major_locator(xmajor_locator) + ax.xaxis.set_minor_locator(xminor_locator) + ymajor_locator = MultipleLocator(1) + yminor_locator = MultipleLocator(0.5) + ax.yaxis.set_major_locator(ymajor_locator) + ax.yaxis.set_minor_locator(yminor_locator) + + # draw grid + ax.grid(True, which='minor', linestyle='-') + + # draw label + ax.set_xticks(np.arange(num_classes)) + ax.set_yticks(np.arange(num_classes)) + ax.set_xticklabels(labels) + ax.set_yticklabels(labels) + + ax.tick_params( + axis='x', bottom=False, top=True, labelbottom=False, labeltop=True) + plt.setp( + ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor') + + # draw confution matrix value + for i in range(num_classes): + for j in range(num_classes): + ax.text( + j, + i, + '{}%'.format( + int(confusion_matrix[ + i, + j]) if not np.isnan(confusion_matrix[i, j]) else -1), + ha='center', + va='center', + color='w', + size=7) + + ax.set_ylim(len(confusion_matrix) - 0.5, -0.5) # matplotlib>3.1.1 + + fig.tight_layout() + if save_dir is not None: + plt.savefig( + os.path.join(save_dir, 'confusion_matrix.png'), format='png') + if show: + plt.show() + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + # replace the ${key} 
with the value of cfg.key + cfg = replace_cfg_vals(cfg) + + # update data root according to MMYOLO_DATASETS + update_data_root(cfg) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + results = load(args.prediction_path) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + dataset = DATASETS.build(cfg.test_dataloader.dataset) + + confusion_matrix = calculate_confusion_matrix(dataset, results, + args.score_thr, + args.nms_iou_thr, + args.tp_iou_thr) + plot_confusion_matrix( + confusion_matrix, + dataset.metainfo['classes'] + ('background', ), + save_dir=args.save_dir, + show=args.show, + color_theme=args.color_theme) + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..b2164e16b9809957b317b3c9406918292300707a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py @@ -0,0 +1,498 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path +from statistics import median + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from prettytable import PrettyTable + +from mmyolo.registry import DATASETS +from mmyolo.utils.misc import show_data_classes + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Distribution of categories and bbox instances') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--val-dataset', + default=False, + action='store_true', + help='The default train_dataset.' 
+ 'To change it to val_dataset, enter "--val-dataset"') + parser.add_argument( + '--class-name', + default=None, + type=str, + help='Display specific class, e.g., "bicycle"') + parser.add_argument( + '--area-rule', + default=None, + type=int, + nargs='+', + help='Redefine area rules,but no more than three numbers.' + ' e.g., 30 70 125') + parser.add_argument( + '--func', + default=None, + type=str, + choices=[ + 'show_bbox_num', 'show_bbox_wh', 'show_bbox_wh_ratio', + 'show_bbox_area' + ], + help='Dataset analysis function selection.') + parser.add_argument( + '--out-dir', + default='./dataset_analysis', + type=str, + help='Output directory of dataset analysis visualization results,' + ' Save in "./dataset_analysis/" by default') + args = parser.parse_args() + return args + + +def show_bbox_num(cfg, out_dir, fig_set, class_name, class_num): + """Display the distribution map of categories and number of bbox + instances.""" + print('\n\nDrawing bbox_num figure:') + # Draw designs + fig = plt.figure( + figsize=(fig_set['figsize'][0], fig_set['figsize'][1]), dpi=300) + plt.bar(class_name, class_num, align='center') + + # Draw titles, labels and so on + for x, y in enumerate(class_num): + plt.text(x, y, '%s' % y, ha='center', fontsize=fig_set['fontsize'] + 3) + plt.xticks(rotation=fig_set['xticks_angle']) + plt.xlabel('Category Name') + plt.ylabel('Num of instances') + plt.title(cfg.dataset_type) + + # Save figure + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_name = fig_set['out_name'] + fig.savefig( + f'{out_dir}/{out_name}_bbox_num.jpg', + bbox_inches='tight', + pad_inches=0.1) # Save Image + plt.close() + print(f'End and save in {out_dir}/{out_name}_bbox_num.jpg') + + +def show_bbox_wh(out_dir, fig_set, class_bbox_w, class_bbox_h, class_name): + """Display the width and height distribution of categories and bbox + instances.""" + print('\n\nDrawing bbox_wh figure:') + # Draw designs + fig, ax = plt.subplots( + figsize=(fig_set['figsize'][0], 
fig_set['figsize'][1]), dpi=300) + + # Set the position of the map and label on the x-axis + positions_w = list(range(0, 12 * len(class_name), 12)) + positions_h = list(range(6, 12 * len(class_name), 12)) + positions_x_label = list(range(3, 12 * len(class_name) + 1, 12)) + ax.violinplot( + class_bbox_w, positions_w, showmeans=True, showmedians=True, widths=4) + ax.violinplot( + class_bbox_h, positions_h, showmeans=True, showmedians=True, widths=4) + + # Draw titles, labels and so on + plt.xticks(rotation=fig_set['xticks_angle']) + plt.ylabel('The width or height of bbox') + plt.xlabel('Class name') + plt.title('Width or height distribution of classes and bbox instances') + + # Draw the max, min and median of wide data in violin chart + for i in range(len(class_bbox_w)): + plt.text( + positions_w[i], + median(class_bbox_w[i]), + f'{"%.2f" % median(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_w[i], + max(class_bbox_w[i]), + f'{"%.2f" % max(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_w[i], + min(class_bbox_w[i]), + f'{"%.2f" % min(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + + # Draw the max, min and median of height data in violin chart + for i in range(len(positions_h)): + plt.text( + positions_h[i], + median(class_bbox_h[i]), + f'{"%.2f" % median(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_h[i], + max(class_bbox_h[i]), + f'{"%.2f" % max(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_h[i], + min(class_bbox_h[i]), + f'{"%.2f" % min(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + + # Draw Legend + plt.setp(ax, xticks=positions_x_label, xticklabels=class_name) + labels = ['bbox_w', 'bbox_h'] + colors = ['steelblue', 'darkorange'] + patches = [ + mpatches.Patch(color=colors[i], label=f'{labels[i]:s}') + for i in range(len(colors)) + ] + ax = 
def show_bbox_wh_ratio(out_dir, fig_set, class_name, class_bbox_ratio):
    """Draw and save a violin plot of the bbox width/height ratio per class.

    Args:
        out_dir (str): Directory the figure is written to. It is created
            when it does not exist yet.
        fig_set (dict): Drawing settings. The keys read here are
            ``figsize``, ``xticks_angle``, ``fontsize`` and ``out_name``.
        class_name (list): Tick labels, one per violin.
        class_bbox_ratio (list): One sequence of ratio values per class.
    """
    print('\n\nDrawing bbox_wh_ratio figure:')
    fig_width = fig_set['figsize'][0]
    fig_height = fig_set['figsize'][1]
    fig, ax = plt.subplots(figsize=(fig_width, fig_height), dpi=300)

    # One violin every 6 units along the x-axis
    positions = [6 * k for k in range(len(class_name))]
    ax.violinplot(
        class_bbox_ratio,
        positions,
        showmeans=True,
        showmedians=True,
        widths=5)

    # Axis decoration
    plt.xticks(rotation=fig_set['xticks_angle'])
    plt.ylabel('Ratio of width to height of bbox')
    plt.xlabel('Class name')
    plt.title('Width to height ratio distribution of class and bbox instances')

    # Annotate every violin with its median, max and min (in that order)
    for idx in range(len(class_bbox_ratio)):
        ratios = class_bbox_ratio[idx]
        for statistic in (median, max, min):
            value = statistic(ratios)
            plt.text(
                positions[idx],
                value,
                '%.2f' % value,
                ha='center',
                fontsize=fig_set['fontsize'])

    # Put the class names under their violins
    plt.setp(ax, xticks=positions, xticklabels=class_name)

    # Write the figure to disk
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_name = fig_set['out_name']
    target = f'{out_dir}/{out_name}_bbox_ratio.jpg'
    fig.savefig(target, bbox_inches='tight', pad_inches=0.1)
    plt.close()
    print(f'End and save in {target}')
def show_class_list(classes, class_num):
    """Print a table with the number of bboxes collected for each class.

    Up to 25 classes are printed as one ``Class name`` / ``Bbox num`` column
    pair. Longer class lists are wrapped into several column pairs of 25
    rows each, and the last pair is padded with empty strings.

    Args:
        classes (Sequence[str]): Class names.
        class_num: Bbox count per class, in the same order as ``classes``.
            Must expose ``tolist()`` (the caller passes a numpy array).
    """
    rows_per_column = 25
    print('\n\nThe information obtained is as follows:')
    class_info = PrettyTable()
    class_info.title = 'Information of dataset class'

    if len(classes) <= rows_per_column:
        # ``<=`` rather than ``<``: exactly 25 classes still fit one column.
        class_info.add_column('Class name', classes)
        class_info.add_column('Bbox num', class_num)
    else:
        class_name_list = list(classes)
        class_nums = class_num.tolist()
        # Pad up to the next multiple of 25. The modulo yields 0 when the
        # length already is a multiple, a case that used to print an empty
        # table because no branch handled it.
        padding = -len(class_name_list) % rows_per_column
        class_name_list.extend([''] * padding)
        class_nums.extend([''] * padding)
        for start in range(0, len(class_name_list), rows_per_column):
            stop = start + rows_per_column
            class_info.add_column('Class name', class_name_list[start:stop])
            class_info.add_column('Bbox num', class_nums[start:stop])

    # Align display data to the left
    class_info.align['Class name'] = 'l'
    class_info.align['Bbox num'] = 'l'
    print(class_info)


def show_data_list(args, area_rule):
    """Print the settings of the current run as a one-row table.

    Args:
        args: Parsed command line namespace. The attributes read here are
            ``val_dataset``, ``class_name`` and ``func``.
        area_rule (list): Area thresholds, shown verbatim in the table.
    """
    print('\n\nPrint current running information:')
    data_info = PrettyTable()
    data_info.title = 'Dataset information'

    # Identity checks on purpose: a value that is neither True nor False
    # adds no 'Dataset type' column at all.
    if args.val_dataset is False:
        data_info.add_column('Dataset type', ['train_dataset'])
    elif args.val_dataset is True:
        data_info.add_column('Dataset type', ['val_dataset'])

    if args.class_name is None:
        data_info.add_column('Class name', ['All classes'])
    else:
        data_info.add_column('Class name', [args.class_name])

    if args.func is None:
        data_info.add_column('Function', ['All function'])
    else:
        data_info.add_column('Function', [args.func])

    data_info.add_column('Area rule', [area_rule])

    print(data_info)
Config.fromfile(args.config) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + def replace_pipeline_to_none(cfg): + """Recursively iterate over all dataset(or datasets) and set their + pipelines to none.Datasets are mean ConcatDataset. + + Recursively terminates only when all dataset(or datasets) have been + traversed + """ + + if cfg.get('dataset', None) is None and cfg.get('datasets', + None) is None: + return + dataset = cfg.dataset if cfg.get('dataset', None) else cfg.datasets + if isinstance(dataset, list): + for item in dataset: + item.pipeline = None + elif dataset.get('pipeline', None): + dataset.pipeline = None + else: + replace_pipeline_to_none(dataset) + + # 1.Build Dataset + if args.val_dataset is False: + replace_pipeline_to_none(cfg.train_dataloader) + dataset = DATASETS.build(cfg.train_dataloader.dataset) + else: + replace_pipeline_to_none(cfg.val_dataloader) + dataset = DATASETS.build(cfg.val_dataloader.dataset) + + # 2.Prepare data + # Drawing settings + fig_all_set = { + 'figsize': [35, 18], + 'fontsize': int(10 - 0.08 * len(dataset.metainfo['classes'])), + 'xticks_angle': 70, + 'out_name': cfg.dataset_type + } + fig_one_set = { + 'figsize': [15, 10], + 'fontsize': 10, + 'xticks_angle': 0, + 'out_name': args.class_name + } + + # Call the category name and save address + if args.class_name is None: + classes = dataset.metainfo['classes'] + classes_idx = [i for i in range(len(classes))] + fig_set = fig_all_set + elif args.class_name in dataset.metainfo['classes']: + classes = [args.class_name] + classes_idx = [dataset.metainfo['classes'].index(args.class_name)] + fig_set = fig_one_set + else: + data_classes = dataset.metainfo['classes'] + show_data_classes(data_classes) + raise RuntimeError(f'Expected args.class_name to be one of the list,' + f'but got "{args.class_name}"') + + # Building Area Rules + if args.area_rule is None: + area_rule = [0, 32, 96, 1e5] + elif args.area_rule and len(args.area_rule) <= 3: + area_rules = [0] + 
args.area_rule + [1e5] + area_rule = sorted(area_rules) + else: + raise RuntimeError( + f'Expected the "{args.area_rule}" to be e.g. 30 60 120, ' + 'and no more than three numbers.') + + # Build arrays or lists to store data for each category + class_num = np.zeros((len(classes), ), dtype=np.int64) + class_bbox = [[] for _ in classes] + class_name = [] + class_bbox_w = [] + class_bbox_h = [] + class_bbox_ratio = [] + bbox_area_num = [] + + show_data_list(args, area_rule) + # Get the quantity and bbox data corresponding to each category + print('\nRead the information of each picture in the dataset:') + progress_bar = ProgressBar(len(dataset)) + for index in range(len(dataset)): + for instance in dataset[index]['instances']: + if instance[ + 'bbox_label'] in classes_idx and args.class_name is None: + class_num[instance['bbox_label']] += 1 + class_bbox[instance['bbox_label']].append(instance['bbox']) + elif instance['bbox_label'] in classes_idx and args.class_name: + class_num[0] += 1 + class_bbox[0].append(instance['bbox']) + progress_bar.update() + show_class_list(classes, class_num) + # Get the width, height and area of bbox corresponding to each category + print('\nRead bbox information in each class:') + progress_bar_classes = ProgressBar(len(classes)) + for idx, (classes, classes_idx) in enumerate(zip(classes, classes_idx)): + bbox = np.array(class_bbox[idx]) + bbox_area_nums = np.zeros((len(area_rule) - 1, ), dtype=np.int64) + if len(bbox) > 0: + bbox_wh = bbox[:, 2:4] - bbox[:, 0:2] + bbox_ratio = bbox_wh[:, 0] / bbox_wh[:, 1] + bbox_area = bbox_wh[:, 0] * bbox_wh[:, 1] + class_bbox_w.append(bbox_wh[:, 0].tolist()) + class_bbox_h.append(bbox_wh[:, 1].tolist()) + class_bbox_ratio.append(bbox_ratio.tolist()) + + # The area rule, there is an section between two numbers + for i in range(len(area_rule) - 1): + bbox_area_nums[i] = np.logical_and( + bbox_area >= area_rule[i]**2, + bbox_area < area_rule[i + 1]**2).sum() + elif len(bbox) == 0: + 
class_bbox_w.append([0]) + class_bbox_h.append([0]) + class_bbox_ratio.append([0]) + + class_name.append(classes) + bbox_area_num.append(bbox_area_nums.tolist()) + progress_bar_classes.update() + + # 3.draw Dataset Information + if args.func is None: + show_bbox_num(cfg, args.out_dir, fig_set, class_name, class_num) + show_bbox_wh(args.out_dir, fig_set, class_bbox_w, class_bbox_h, + class_name) + show_bbox_wh_ratio(args.out_dir, fig_set, class_name, class_bbox_ratio) + show_bbox_area(args.out_dir, fig_set, area_rule, class_name, + bbox_area_num) + elif args.func == 'show_bbox_num': + show_bbox_num(cfg, args.out_dir, fig_set, class_name, class_num) + elif args.func == 'show_bbox_wh': + show_bbox_wh(args.out_dir, fig_set, class_bbox_w, class_bbox_h, + class_name) + elif args.func == 'show_bbox_wh_ratio': + show_bbox_wh_ratio(args.out_dir, fig_set, class_name, class_bbox_ratio) + elif args.func == 'show_bbox_area': + show_bbox_area(args.out_dir, fig_set, area_rule, class_name, + bbox_area_num) + else: + raise RuntimeError( + 'Please enter the correct func name, e.g., show_bbox_num') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/get_flops.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..965660f7194de231770537d7f80e38f41876df56 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/get_flops.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import tempfile +from pathlib import Path + +import torch +from mmdet.registry import MODELS +from mmengine.analysis import get_model_complexity_info +from mmengine.config import Config, DictAction +from mmengine.logging import MMLogger +from mmengine.model import revert_sync_batchnorm +from mmengine.registry import init_default_scope + +from mmyolo.utils import switch_to_deploy + + +def parse_args(): + parser = argparse.ArgumentParser(description='Get a detector flops') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[640, 640], + help='input image size') + parser.add_argument( + '--show-arch', + action='store_true', + help='whether return the statistics in the form of network layers') + parser.add_argument( + '--not-show-table', + action='store_true', + help='whether return the statistics in the form of table'), + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
def inference(args, logger):
    """Build the detector described by ``args.config`` and measure it.

    Args:
        args: Parsed command line namespace. The attributes read here are
            ``config``, ``cfg_options``, ``shape``, ``not_show_table`` and
            ``show_arch``.
        logger: Logger used to report a missing config file.

    Returns:
        dict: ``ori_shape`` (the requested ``(h, w)``), ``pad_shape`` (the
        last two dimensions of the preprocessed input), plus ``flops``,
        ``params``, ``out_table`` and ``out_arch`` taken from
        ``get_model_complexity_info``.

    Raises:
        ValueError: If ``args.shape`` holds neither one nor two values.
    """
    config_path = Path(args.config)
    if not config_path.exists():
        # Only reported; execution still continues to ``Config.fromfile``.
        logger.error(f'{config_path} not found.')

    cfg = Config.fromfile(args.config)
    cfg.work_dir = tempfile.TemporaryDirectory().name
    cfg.log_level = 'WARN'
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    init_default_scope(cfg.get('default_scope', 'mmyolo'))

    # A single value means a square input
    num_dims = len(args.shape)
    if num_dims not in (1, 2):
        raise ValueError('invalid input shape')
    if num_dims == 1:
        h = w = args.shape[0]
    else:
        h, w = args.shape

    # Build the model and put it into its deploy form
    model = MODELS.build(cfg.model)
    if torch.cuda.is_available():
        model.cuda()
    model = revert_sync_batchnorm(model)
    model.eval()
    switch_to_deploy(model)

    # Random image of the requested size, run through the model's own
    # data preprocessor.
    batch = {'inputs': [torch.rand(3, h, w)], 'batch_samples': [None]}
    inputs = model.data_preprocessor(batch)['inputs']
    pad_shape = inputs.shape[-2:]

    complexity = get_model_complexity_info(
        model,
        input_shape=None,
        inputs=inputs,  # the input tensor of the model
        show_table=not args.not_show_table,  # show the complexity table
        show_arch=args.show_arch)  # show the complexity arch

    return {
        'ori_shape': (h, w),
        'pad_shape': pad_shape,
        'flops': complexity['flops_str'],
        'params': complexity['params_str'],
        'out_table': complexity['out_table'],
        'out_arch': complexity['out_arch'],
    }
if pad_shape != ori_shape: + print(f'{split_line}\nUse size divisor set input shape ' + f'from {ori_shape} to {pad_shape}') + + print(f'{split_line}\n' + f'Input shape: {pad_shape}\nModel Flops: {flops}\n' + f'Model Parameters: {params}\n{split_line}') + print('!!!Please be cautious if you use the results in papers. ' + 'You may need to check if all ops are supported and verify ' + 'that the flops computation is correct.') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py new file mode 100644 index 0000000000000000000000000000000000000000..34d4d067a6470a610b53868f18203827676892a2 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py @@ -0,0 +1,647 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Optimize anchor settings on a specific dataset. + +This script provides three methods to optimize YOLO anchors including k-means +anchor cluster, differential evolution and v5-k-means. You can use +``--algorithm k-means``, ``--algorithm differential_evolution`` and +``--algorithm v5-k-means`` to switch those methods. 
def parse_args(argv=None):
    """Parse the command line options of the anchor optimization tool.

    Args:
        argv (list[str], optional): Arguments to parse. When None, argparse
            falls back to ``sys.argv[1:]``, which is the behaviour of the
            original zero-argument call. Defaults to None.

    Returns:
        argparse.Namespace: Namespace with ``config``, ``input_shape``,
        ``algorithm``, ``iters``, ``prior_match_thr``, ``mutation_args``,
        ``augment_args``, ``device`` and ``out_dir``.
    """
    parser = argparse.ArgumentParser(description='Optimize anchor parameters.')
    parser.add_argument('config', help='Train config file path.')
    parser.add_argument(
        '--input-shape',
        type=int,
        nargs='+',
        default=[640, 640],
        help='input image size, represent [width, height]')
    # Adjacent literals are joined verbatim, so each one ends with a space.
    parser.add_argument(
        '--algorithm',
        default='DE',
        help='Algorithm used for anchor optimizing. '
        'Support k-means and DE (differential_evolution) for YOLO, '
        'and v5-k-means is special for YOLOV5.')
    parser.add_argument(
        '--iters',
        default=1000,
        type=int,
        help='Maximum iterations for optimizer.')
    parser.add_argument(
        '--prior-match-thr',
        default=4.0,
        type=float,
        help='anchor-label `gt_filter_sizes` ratio threshold '
        'hyperparameter used for training, default=4.0, this '
        'parameter is unique to v5-k-means')
    parser.add_argument(
        '--mutation-args',
        type=float,
        nargs='+',
        default=[0.9, 0.1],
        help='parameter of anchor optimize method genetic algorithm, '
        'represent [prob, sigma], this parameter is unique to v5-k-means')
    parser.add_argument(
        '--augment-args',
        type=float,
        nargs='+',
        default=[0.9, 1.1],
        help='scale factor of box size augment when metric box and anchor, '
        'represent [min, max], this parameter is unique to v5-k-means')
    parser.add_argument(
        '--device', default='cuda:0', help='Device used for calculating.')
    parser.add_argument(
        '--out-dir',
        default=None,
        type=str,
        help='Path to save anchor optimize result.')

    return parser.parse_args(argv)
+ Default: None + """ + + def __init__(self, + dataset, + input_shape, + num_anchor_per_level, + logger, + device='cuda:0', + out_dir=None): + self.dataset = dataset + self.input_shape = input_shape + self.num_anchor_per_level = num_anchor_per_level + self.num_anchors = sum(num_anchor_per_level) + self.logger = logger + self.device = device + self.out_dir = out_dir + bbox_whs, img_shapes = self.get_whs_and_shapes() + ratios = img_shapes.max(1, keepdims=True) / np.array([input_shape]) + + # resize to input shape + self.bbox_whs = bbox_whs / ratios + + def get_whs_and_shapes(self): + """Get widths and heights of bboxes and shapes of images. + + Returns: + tuple[np.ndarray]: Array of bbox shapes and array of image + shapes with shape (num_bboxes, 2) in [width, height] format. + """ + self.logger.info('Collecting bboxes from annotation...') + bbox_whs = [] + img_shapes = [] + prog_bar = ProgressBar(len(self.dataset)) + for idx in range(len(self.dataset)): + data_info = self.dataset.get_data_info(idx) + img_shape = np.array([data_info['width'], data_info['height']]) + gt_instances = data_info['instances'] + for instance in gt_instances: + bbox = np.array(instance['bbox']) + gt_filter_sizes = bbox[2:4] - bbox[0:2] + img_shapes.append(img_shape) + bbox_whs.append(gt_filter_sizes) + + prog_bar.update() + print('\n') + bbox_whs = np.array(bbox_whs) + img_shapes = np.array(img_shapes) + self.logger.info(f'Collected {bbox_whs.shape[0]} bboxes.') + return bbox_whs, img_shapes + + def get_zero_center_bbox_tensor(self): + """Get a tensor of bboxes centered at (0, 0). + + Returns: + Tensor: Tensor of bboxes with shape (num_bboxes, 4) + in [xmin, ymin, xmax, ymax] format. 
class YOLOKMeansAnchorOptimizer(BaseAnchorOptimizer):
    r"""YOLO anchor optimizer using k-means on zero-centered boxes.

    The similarity between a box and a cluster center is their IoU. Code
    refers to AlexeyAB/darknet.

    Args:
        iters (int): Maximum iterations for k-means.
    """

    def __init__(self, iters, **kwargs):

        super().__init__(**kwargs)
        self.iters = iters

    def optimize(self):
        """Cluster the anchors and hand them to ``save_result``."""
        anchors = self.kmeans_anchors()
        self.save_result(anchors, self.out_dir)

    def kmeans_anchors(self):
        """Run k-means and return the anchors sorted by area.

        Returns:
            list: ``self.num_anchors`` arrays of ``[width, height]``,
            ordered from the smallest to the largest area.
        """
        self.logger.info(
            f'Start cluster {self.num_anchors} YOLO anchors with K-means...')
        bboxes = self.get_zero_center_bbox_tensor()
        # Initial centers are boxes drawn at random (with replacement)
        cluster_center_idx = torch.randint(
            0, bboxes.shape[0], (self.num_anchors, )).to(self.device)

        assignments = torch.zeros((bboxes.shape[0], )).to(self.device)
        cluster_centers = bboxes[cluster_center_idx]
        if self.num_anchors == 1:
            # All boxes are assigned to cluster 0 already, so a single
            # maximization step yields the mean box.
            cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                       cluster_centers)
            anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
            anchors = sorted(anchors, key=lambda x: x[0] * x[1])
            return anchors

        prog_bar = ProgressBar(self.iters)
        for i in range(self.iters):
            converged, assignments = self.kmeans_expectation(
                bboxes, assignments, cluster_centers)
            if converged:
                self.logger.info(f'K-means process has converged at iter {i}.')
                break
            cluster_centers = self.kmeans_maximization(bboxes, assignments,
                                                       cluster_centers)
            prog_bar.update()
        print('\n')
        avg_iou = bbox_overlaps(bboxes,
                                cluster_centers).max(1)[0].mean().item()

        anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy()
        anchors = sorted(anchors, key=lambda x: x[0] * x[1])
        self.logger.info(f'Anchor cluster finish. Average IOU: {avg_iou}')

        return anchors

    def kmeans_maximization(self, bboxes, assignments, centers):
        """Maximization part of EM algorithm(Expectation-Maximization).

        Each center is moved to the mean of the boxes assigned to it. A
        cluster without any member keeps its previous center. Starting from
        zeros instead would turn it into a 0x0 box that ends up among the
        returned anchors.

        Args:
            bboxes (Tensor): Boxes being clustered.
            assignments (Tensor): Cluster index of every box.
            centers (Tensor): Current cluster centers.

        Returns:
            Tensor: Updated centers, same shape as ``centers``.
        """
        new_centers = centers.clone()
        for i in range(centers.shape[0]):
            mask = (assignments == i)
            if mask.sum():
                new_centers[i, :] = bboxes[mask].mean(0)
        return new_centers

    def kmeans_expectation(self, bboxes, assignments, centers):
        """Expectation part of EM algorithm(Expectation-Maximization).

        Args:
            bboxes (Tensor): Boxes being clustered.
            assignments (Tensor): Cluster index of every box from the
                previous iteration.
            centers (Tensor): Current cluster centers.

        Returns:
            tuple: ``(converged, closest)`` where ``closest`` is the index
            of the highest-IoU center per box and ``converged`` tells
            whether it equals ``assignments`` everywhere.
        """
        ious = bbox_overlaps(bboxes, centers)
        closest = ious.argmax(1)
        converged = (closest == assignments).all()
        return converged, closest
+ """ + + def __init__(self, + iters, + prior_match_thr=4.0, + mutation_args=[0.9, 0.1], + augment_args=[0.9, 1.1], + **kwargs): + + super().__init__(**kwargs) + self.iters = iters + self.prior_match_thr = prior_match_thr + [self.mutation_prob, self.mutation_sigma] = mutation_args + [self.augment_min, self.augment_max] = augment_args + + def optimize(self): + self.logger.info( + f'Start cluster {self.num_anchors} YOLOv5 anchors with K-means...') + + bbox_whs = torch.from_numpy(self.bbox_whs).to( + self.device, dtype=torch.float32) + anchors = self.anchor_generate( + bbox_whs, + num=self.num_anchors, + img_size=self.input_shape[0], + prior_match_thr=self.prior_match_thr, + iters=self.iters) + best_ratio, mean_matched = self.anchor_metric(bbox_whs, anchors) + self.logger.info(f'{mean_matched:.2f} anchors/target {best_ratio:.3f} ' + 'Best Possible Recall (BPR). ') + self.save_result(anchors.tolist(), self.out_dir) + + def anchor_generate(self, + box_size: Tensor, + num: int = 9, + img_size: int = 640, + prior_match_thr: float = 4.0, + iters: int = 1000) -> Tensor: + """cluster boxes metric with anchors. + + Args: + box_size (Tensor): The size of the bxes, which shape is + (box_num, 2),the number 2 means width and height. + num (int): number of anchors. 
+ img_size (int): image size used for training + prior_match_thr (float): width/height ratio threshold + used for training + iters (int): iterations to evolve anchors using genetic algorithm + + Returns: + anchors (Tensor): kmeans evolved anchors + """ + + thr = 1 / prior_match_thr + + # step1: filter small bbox + box_size = self._filter_box(box_size) + assert num <= len(box_size) + + # step2: init anchors + if kmeans: + try: + self.logger.info( + 'beginning init anchors with scipy kmeans method') + # sigmas for whitening + sigmas = box_size.std(0).cpu().numpy() + anchors = kmeans( + box_size.cpu().numpy() / sigmas, num, iter=30)[0] * sigmas + # kmeans may return fewer points than requested + # if width/height is insufficient or too similar + assert num == len(anchors) + except Exception: + self.logger.warning( + 'scipy kmeans method cannot get enough points ' + 'because of width/height is insufficient or too similar, ' + 'now switching strategies from kmeans to random init.') + anchors = np.sort(np.random.rand(num * 2)).reshape( + num, 2) * img_size + else: + self.logger.info( + 'cannot found scipy package, switching strategies from kmeans ' + 'to random init, you can install scipy package to ' + 'get better anchor init') + anchors = np.sort(np.random.rand(num * 2)).reshape(num, + 2) * img_size + + self.logger.info('init done, beginning evolve anchors...') + # sort small to large + anchors = torch.tensor(anchors[np.argsort(anchors.prod(1))]).to( + box_size.device, dtype=torch.float32) + + # step3: evolve anchors use Genetic Algorithm + prog_bar = ProgressBar(iters) + fitness = self._anchor_fitness(box_size, anchors, thr) + cluster_shape = anchors.shape + + for _ in range(iters): + mutate_result = np.ones(cluster_shape) + # mutate until a change occurs (prevent duplicates) + while (mutate_result == 1).all(): + # mutate_result is scale factor of anchors, between 0.3 and 3 + mutate_result = ( + (np.random.random(cluster_shape) < self.mutation_prob) * + 
random.random() * np.random.randn(*cluster_shape) * + self.mutation_sigma + 1).clip(0.3, 3.0) + mutate_result = torch.from_numpy(mutate_result).to(box_size.device) + new_anchors = (anchors.clone() * mutate_result).clip(min=2.0) + new_fitness = self._anchor_fitness(box_size, new_anchors, thr) + if new_fitness > fitness: + fitness = new_fitness + anchors = new_anchors.clone() + + prog_bar.update() + print('\n') + # sort small to large + anchors = anchors[torch.argsort(anchors.prod(1))] + self.logger.info(f'Anchor cluster finish. fitness = {fitness:.4f}') + + return anchors + + def anchor_metric(self, + box_size: Tensor, + anchors: Tensor, + threshold: float = 4.0) -> Tuple: + """compute boxes metric with anchors. + + Args: + box_size (Tensor): The size of the bxes, which shape + is (box_num, 2), the number 2 means width and height. + anchors (Tensor): The size of the bxes, which shape + is (anchor_num, 2), the number 2 means width and height. + threshold (float): the compare threshold of ratio + + Returns: + Tuple: a tuple of metric result, best_ratio_mean and mean_matched + """ + # step1: augment scale + # According to the uniform distribution,the scaling scale between + # augment_min and augment_max is randomly generated + scale = np.random.uniform( + self.augment_min, self.augment_max, size=(box_size.shape[0], 1)) + box_size = torch.tensor( + np.array( + [l[:, ] * s for s, l in zip(scale, + box_size.cpu().numpy())])).to( + box_size.device, + dtype=torch.float32) + # step2: calculate ratio + min_ratio, best_ratio = self._metric(box_size, anchors) + mean_matched = (min_ratio > 1 / threshold).float().sum(1).mean() + best_ratio_mean = (best_ratio > 1 / threshold).float().mean() + return best_ratio_mean, mean_matched + + def _filter_box(self, box_size: Tensor) -> Tensor: + small_cnt = (box_size < 3.0).any(1).sum() + if small_cnt: + self.logger.warning( + f'Extremely small objects found: {small_cnt} ' + f'of {len(box_size)} labels are <3 pixels in size') + # filter > 2 
pixels + filter_sizes = box_size[(box_size >= 2.0).any(1)] + return filter_sizes + + def _anchor_fitness(self, box_size: Tensor, anchors: Tensor, thr: float): + """mutation fitness.""" + _, best = self._metric(box_size, anchors) + return (best * (best > thr).float()).mean() + + def _metric(self, box_size: Tensor, anchors: Tensor) -> Tuple: + """compute boxes metric with anchors. + + Args: + box_size (Tensor): The size of the bxes, which shape is + (box_num, 2), the number 2 means width and height. + anchors (Tensor): The size of the bxes, which shape is + (anchor_num, 2), the number 2 means width and height. + + Returns: + Tuple: a tuple of metric result, min_ratio and best_ratio + """ + + # ratio means the (width_1/width_2 and height_1/height_2) ratio of each + # box and anchor, the ratio shape is torch.Size([box_num,anchor_num,2]) + ratio = box_size[:, None] / anchors[None] + + # min_ratio records the min ratio of each box with all anchor, + # min_ratio.shape is torch.Size([box_num,anchor_num]) + # notice: + # smaller ratio means worse shape-match between boxes and anchors + min_ratio = torch.min(ratio, 1 / ratio).min(2)[0] + + # find the best shape-match ratio for each box + # box_best_ratio.shape is torch.Size([box_num]) + best_ratio = min_ratio.max(1)[0] + + return min_ratio, best_ratio + + +class YOLODEAnchorOptimizer(BaseAnchorOptimizer): + """YOLO anchor optimizer using differential evolution algorithm. + + Args: + iters (int): Maximum iterations for k-means. + strategy (str): The differential evolution strategy to use. + Should be one of: + + - 'best1bin' + - 'best1exp' + - 'rand1exp' + - 'randtobest1exp' + - 'currenttobest1exp' + - 'best2exp' + - 'rand2exp' + - 'randtobest1bin' + - 'currenttobest1bin' + - 'best2bin' + - 'rand2bin' + - 'rand1bin' + + Default: 'best1bin'. + population_size (int): Total population size of evolution algorithm. + Default: 15. 
+ convergence_thr (float): Tolerance for convergence, the + optimizing stops when ``np.std(pop) <= abs(convergence_thr) + + convergence_thr * np.abs(np.mean(population_energies))``, + respectively. Default: 0.0001. + mutation (tuple[float]): Range of dithering randomly changes the + mutation constant. Default: (0.5, 1). + recombination (float): Recombination constant of crossover probability. + Default: 0.7. + """ + + def __init__(self, + iters, + strategy='best1bin', + population_size=15, + convergence_thr=0.0001, + mutation=(0.5, 1), + recombination=0.7, + **kwargs): + + super().__init__(**kwargs) + + self.iters = iters + self.strategy = strategy + self.population_size = population_size + self.convergence_thr = convergence_thr + self.mutation = mutation + self.recombination = recombination + + def optimize(self): + anchors = self.differential_evolution() + self.save_result(anchors, self.out_dir) + + def differential_evolution(self): + bboxes = self.get_zero_center_bbox_tensor() + + bounds = [] + for i in range(self.num_anchors): + bounds.extend([(0, self.input_shape[0]), (0, self.input_shape[1])]) + + result = differential_evolution( + func=self.avg_iou_cost, + bounds=bounds, + args=(bboxes, ), + strategy=self.strategy, + maxiter=self.iters, + popsize=self.population_size, + tol=self.convergence_thr, + mutation=self.mutation, + recombination=self.recombination, + updating='immediate', + disp=True) + self.logger.info( + f'Anchor evolution finish. 
def main():
    """Build the training dataset and run the selected anchor optimizer.

    ``--algorithm`` accepts ``k-means``, ``v5-k-means`` and ``DE``. The
    spelling ``differential_evolution`` used in the usage examples is
    accepted as an alias of ``DE``.

    Raises:
        NotImplementedError: If ``--algorithm`` names an unknown method.
    """
    logger = MMLogger.get_current_instance()
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # replace the ${key} with the value of cfg.key
    cfg = replace_cfg_vals(cfg)

    # update data root according to MMDET_DATASETS
    update_data_root(cfg)

    init_default_scope(cfg.get('default_scope', 'mmyolo'))

    input_shape = args.input_shape
    assert len(input_shape) == 2

    prior_generator = cfg.model.bbox_head.prior_generator
    anchor_type = prior_generator.type
    assert anchor_type == 'mmdet.YOLOAnchorGenerator', \
        f'Only support optimize YOLOAnchor, but get {anchor_type}.'

    num_anchor_per_level = [len(sizes) for sizes in prior_generator.base_sizes]

    # Unwrap nested dataset wrappers until the innermost dataset config
    train_data_cfg = cfg.train_dataloader
    while 'dataset' in train_data_cfg:
        train_data_cfg = train_data_cfg['dataset']
    dataset = DATASETS.build(train_data_cfg)

    # Arguments shared by every optimizer
    common_kwargs = dict(
        dataset=dataset,
        input_shape=input_shape,
        device=args.device,
        num_anchor_per_level=num_anchor_per_level,
        iters=args.iters,
        logger=logger,
        out_dir=args.out_dir)

    if args.algorithm == 'k-means':
        optimizer = YOLOKMeansAnchorOptimizer(**common_kwargs)
    elif args.algorithm in ('DE', 'differential_evolution'):
        optimizer = YOLODEAnchorOptimizer(**common_kwargs)
    elif args.algorithm == 'v5-k-means':
        optimizer = YOLOV5KMeansAnchorOptimizer(
            prior_match_thr=args.prior_match_thr,
            mutation_args=args.mutation_args,
            augment_args=args.augment_args,
            **common_kwargs)
    else:
        raise NotImplementedError(
            f'Only support k-means, DE (differential_evolution) and '
            f'v5-k-means, but get {args.algorithm}')

    optimizer.optimize()
def parse_args():
    """Parse the command line of the scheduler visualization tool.

    Returns:
        argparse.Namespace: Parsed arguments.

    Raises:
        AssertionError: If ``--window-size`` is non-empty and is not exactly
            of the form ``W*H`` with integer ``W`` and ``H``.
    """
    parser = argparse.ArgumentParser(
        description='Visualize a hyper-parameter scheduler')
    parser.add_argument('config', help='config file path')
    parser.add_argument(
        '-p',
        '--parameter',
        type=str,
        default='lr',
        choices=['lr', 'momentum', 'wd'],
        help='The parameter to visualize its change curve, choose from '
        '"lr", "wd" and "momentum". Defaults to "lr".')
    parser.add_argument(
        '-d',
        '--dataset-size',
        type=int,
        help='The size of the dataset. If specify, `DATASETS.build` will '
        'be skipped and use this size as the dataset size.')
    parser.add_argument(
        '-n',
        '--ngpus',
        type=int,
        default=1,
        help='The number of GPUs used in training.')
    parser.add_argument(
        '-o', '--out-dir', type=Path, help='Path to output file')
    parser.add_argument(
        '--log-level',
        default='WARNING',
        help='The log level of the handler and logger. Defaults to '
        'WARNING.')
    parser.add_argument('--title', type=str, help='title of figure')
    parser.add_argument(
        '--style', type=str, default='whitegrid', help='style of plt')
    parser.add_argument('--not-show', default=False, action='store_true')
    parser.add_argument(
        '--window-size',
        default='12*7',
        help='Size of the window to display images, in format of "$W*$H".')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    args = parser.parse_args()
    if args.window_size != '':
        # fullmatch (not match) so trailing garbage such as "12*7abc" is
        # rejected here instead of failing later when the parts are passed
        # to int().
        assert re.fullmatch(r'\d+\*\d+', args.window_size), \
            "'window-size' must be in format 'W*H'."

    return args
def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True):
    """Draw the recorded parameter values against the iteration index.

    Args:
        lr_list (list): Recorded values, one per training iteration.
        args: Parsed CLI arguments; ``style``, ``window_size``, ``title``
            and ``config`` are read.
        param_name (str): Label of the y axis, also used in the default title.
        iters_per_epoch (int): Ratio used to convert between the iteration
            axis and the secondary epoch axis.
        by_epoch (bool): Whether to add the secondary epoch axis.
            Defaults to True.
    """
    # seaborn is optional; fall back to the default matplotlib style.
    try:
        import seaborn as sns
        sns.set_style(args.style)
    except ImportError:
        pass

    width_text, height_text = args.window_size.split('*')
    plt.figure(figsize=(int(width_text), int(height_text)))

    axes = plt.subplot()
    axes.plot(lr_list, linewidth=1)

    if not by_epoch:
        plt.xlabel('Iters')
    else:

        def iters_to_epochs(x):
            return x / iters_per_epoch

        def epochs_to_iters(y):
            return y * iters_per_epoch

        # Iterations go on the top axis, epochs on a secondary bottom axis.
        axes.xaxis.tick_top()
        axes.set_xlabel('Iters')
        axes.xaxis.set_label_position('top')
        epoch_axis = axes.secondary_xaxis(
            'bottom', functions=(iters_to_epochs, epochs_to_iters))
        epoch_axis.set_xlabel('Epochs')
    plt.ylabel(param_name)

    figure_title = args.title
    if figure_title is None:
        figure_title = f'{osp.basename(args.config)} {param_name} curve'
    plt.title(figure_title)
visualizer=MagicMock(spec=Visualizer), + custom_hooks=cfg.get('custom_hooks', None)) + + runner.train() + + param_dict = dict( + lr=param_record_hook.lr_list, + momentum=param_record_hook.momentum_list, + wd=param_record_hook.wd_list) + + return param_dict + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + if cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.log_level = args.log_level + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + # init logger + print('Param_scheduler :') + rich.print_json(json.dumps(cfg.param_scheduler)) + + # prepare data loader + batch_size = cfg.train_dataloader.batch_size * args.ngpus + + if 'by_epoch' in cfg.train_cfg: + by_epoch = cfg.train_cfg.get('by_epoch') + elif 'type' in cfg.train_cfg: + by_epoch = cfg.train_cfg.get('type') == 'EpochBasedTrainLoop' + else: + raise ValueError('please set `train_cfg`.') + + if args.dataset_size is None and by_epoch: + from mmyolo.registry import DATASETS + dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) + else: + dataset_size = args.dataset_size or batch_size + + class FakeDataloader(list): + dataset = MagicMock(metainfo=None) + + data_loader = FakeDataloader(range(dataset_size // batch_size)) + dataset_info = ( + f'\nDataset infos:' + f'\n - Dataset size: {dataset_size}' + f'\n - Batch size per GPU: {cfg.train_dataloader.batch_size}' + f'\n - Number of GPUs: {args.ngpus}' + f'\n - Total batch size: {batch_size}') + if by_epoch: + dataset_info += f'\n - Iterations per epoch: {len(data_loader)}' + rich.print(dataset_info + '\n') + + # simulation training process + param_dict = simulate_train(data_loader, cfg, by_epoch) + param_list = param_dict[args.parameter] + + if args.parameter == 'lr': + param_name = 'Learning Rate' + 
def convert_balloon_to_coco(ann_file, out_file, image_prefix):
    """Convert a balloon VIA region file into a COCO-style annotation file.

    Args:
        ann_file (str): Path of the annotation file loaded with
            ``mmengine.load``.
        out_file (str): Destination of the dumped COCO dictionary.
        image_prefix (str): Directory joined with each ``filename`` to read
            the image and obtain its height and width.
    """
    via_records = mmengine.load(ann_file)

    images = []
    annotations = []
    progress = mmengine.track_iter_progress(via_records.values())
    for image_id, record in enumerate(progress):
        file_name = record['filename']
        # Image size is not stored in the annotation, so read it from disk.
        img_height, img_width = mmcv.imread(
            osp.join(image_prefix, file_name)).shape[:2]
        images.append(
            dict(
                id=image_id,
                file_name=file_name,
                height=img_height,
                width=img_width))

        for region in record['regions'].values():
            assert not region['region_attributes']
            shape = region['shape_attributes']
            xs = shape['all_points_x']
            ys = shape['all_points_y']
            # Interleave as x0, y0, x1, y1, ... with each value shifted by
            # half a pixel.
            polygon = [
                coord + 0.5 for point in zip(xs, ys) for coord in point
            ]

            left, top = min(xs), min(ys)
            right, bottom = max(xs), max(ys)
            box_w, box_h = right - left, bottom - top

            # Annotation ids are the running count of annotations so far.
            annotations.append(
                dict(
                    image_id=image_id,
                    id=len(annotations),
                    category_id=0,
                    bbox=[left, top, box_w, box_h],
                    area=box_w * box_h,
                    segmentation=[polygon],
                    iscrowd=0))

    mmengine.dump(
        dict(
            images=images,
            annotations=annotations,
            categories=[{
                'id': 0,
                'name': 'balloon'
            }]), out_file)
+# Reference: https://github.com/jbwang1997/BboxToolkit + +import argparse +import codecs +import datetime +import itertools +import os +import os.path as osp +import time +from functools import partial, reduce +from math import ceil +from multiprocessing import Manager, Pool +from typing import List, Sequence + +import cv2 +import numpy as np +from mmengine import Config, MMLogger, mkdir_or_exist, print_log +from PIL import Image + +Image.MAX_IMAGE_PIXELS = None + +try: + import shapely.geometry as shgeo +except ImportError: + raise ImportError('Please run "pip install shapely" ' + 'to install shapely first.') + +PHASE_REQUIRE_SETS = dict( + trainval=['train', 'val'], + train=[ + 'train', + ], + val=[ + 'val', + ], + test=[ + 'test', + ], +) + + +def parse_args(): + """Parse arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + 'split_config', type=str, help='The split config for image slicing.') + parser.add_argument( + 'data_root', type=str, help='Root dir of DOTA dataset.') + parser.add_argument( + 'out_dir', type=str, help='Output dir for split result.') + parser.add_argument( + '--ann-subdir', + default='labelTxt-v1.0', + type=str, + help='output directory') + parser.add_argument( + '--phase', + '-p', + nargs='+', + default=['trainval', 'test'], + type=str, + choices=['trainval', 'train', 'val', 'test'], + help='Phase of the data set to be prepared.') + parser.add_argument( + '--nproc', default=8, type=int, help='Number of processes.') + parser.add_argument( + '--save-ext', + default=None, + type=str, + help='Extension of the saved image.') + parser.add_argument( + '--overwrite', + action='store_true', + help='Whether to allow overwrite if annotation folder exist.') + args = parser.parse_args() + + assert args.split_config is not None, "argument split_config can't be None" + split_cfg = Config.fromfile(args.split_config) + + # assert arguments + assert args.data_root is not None, "argument data_root can't be None" + if args.save_ext: + 
def _make_dirs(out_dir: str, phase: List[str], allow_overwrite: bool):
    """Prepare folder for DOTA dataset.

    For every phase, ``<out_dir>/<phase>``, ``<out_dir>/<phase>/images`` and
    ``<out_dir>/<phase>/annfiles`` are created.

    Args:
        out_dir (str): The output dir for DOTA split.
        phase (List[str]): The phase to prepare.
        allow_overwrite (bool): Whether to allow overwrite when folder exist.

    Raises:
        AssertionError: If a phase folder already exists and
            ``allow_overwrite`` is False.
    """
    logger = MMLogger.get_current_instance()
    for p in phase:
        phase_dir = osp.join(out_dir, p)
        if osp.exists(phase_dir):
            if not allow_overwrite:
                # Raised explicitly (same exception type as before) so the
                # guard against clobbering data still runs under ``-O``.
                raise AssertionError(
                    f'{phase_dir} already exists. '
                    'If you want to ignore existing files, set --overwrite')
            logger.warning(f'{p} set in {phase_dir} will be overwritten')
        mkdir_or_exist(phase_dir)
        mkdir_or_exist(osp.join(phase_dir, 'images'))
        mkdir_or_exist(osp.join(phase_dir, 'annfiles'))
def _load_dota_txt(txtfile):
    """Load DOTA's txt annotation.

    Lines starting with ``gsd`` set the ground sampling distance. Every other
    line with at least nine whitespace-separated fields is read as eight
    polygon coordinates, a label and an optional difficulty flag. Shorter
    lines are ignored.

    Args:
        txtfile (str | None): Filename of single Dota txt annotation. ``None``
            or a missing file yields empty annotations.

    Returns:
        result (dict): Annotation of single image.

        - ``annotations``: Dict with ``bboxes`` (float32 array of shape
          (N, 8)), ``labels`` (list of str) and ``diffs`` (int64 array of
          shape (N,)).
        - ``gsd``: The ground sampling distance, or None when absent or
          unparsable.
    """
    gsd, bboxes, labels, diffs = None, [], [], []
    if txtfile is None:
        pass
    elif not osp.isfile(txtfile):
        print(f"Can't find {txtfile}, treated as empty txtfile")
    else:
        with open(txtfile) as f:
            for line in f:
                if line.startswith('gsd'):
                    num = line.split(':')[-1]
                    try:
                        gsd = float(num)
                    except ValueError:
                        gsd = None
                    continue

                # Split on any whitespace so the trailing newline never ends
                # up inside the label and repeated spaces/tabs do not create
                # empty fields.
                items = line.split()
                if len(items) >= 9:
                    bboxes.append([float(i) for i in items[:8]])
                    labels.append(items[8])
                    # The difficulty column is optional; default to 0.
                    diffs.append(int(items[9]) if len(items) == 10 else 0)

    bboxes = np.array(bboxes, dtype=np.float32) if bboxes else \
        np.zeros((0, 8), dtype=np.float32)
    diffs = np.array(diffs, dtype=np.int64) if diffs else \
        np.zeros((0,), dtype=np.int64)
    ann = dict(bboxes=bboxes, labels=labels, diffs=diffs)
    return dict(gsd=gsd, annotations=ann)
def get_window_annotation(info, windows, iof_thr):
    """Collect, for every sliding window, the annotations that fall in it.

    Args:
        info (dict): Image info; ``info['annotations']`` must contain
            ``bboxes`` plus any other per-object fields.
        windows (np.array): Sliding windows, one row per window.
        iof_thr (float): Minimum iof (from ``ann_window_iof``) for an object
            to be assigned to a window.

    Returns:
        list[dict]: One dict per window holding the selected entries of every
        annotation field, plus ``trunc`` which is True where the object's iof
        with the window is below 1.
    """

    def _pick(values, indices):
        # Arrays support list indexing directly; plain lists raise TypeError
        # and are filtered element by element instead.
        try:
            return values[indices]
        except TypeError:
            return [values[j] for j in indices]

    annotations = info['annotations']
    iof_matrix = ann_window_iof(annotations['bboxes'], windows)

    per_window = []
    for col in range(windows.shape[0]):
        column_iofs = iof_matrix[:, col]
        kept = np.nonzero(column_iofs >= iof_thr)[0].tolist()

        selected = {key: _pick(field, kept)
                    for key, field in annotations.items()}
        selected['trunc'] = column_iofs[kept] < 1
        per_window.append(selected)
    return per_window
img_ext): + """Crop the image and save. + + Args: + info (dict): Image's information. + windows (np.array): information of sliding windows. + window_anns (list[dict]): List of bbox annotations of every window. + padding (bool): If True, with padding. + padding_value (tuple[int|float]): Padding value. + save_dir (str): Save filename. + anno_dir (str): Annotation filename. + img_ext (str): Picture suffix. + + Returns: + list[dict]: Information of paths. + """ + img = cv2.imread(info['filepath']) + patch_infos = [] + for window, ann in zip(windows, window_anns): + patch_info = dict() + for k, v in info.items(): + if k not in [ + 'id', 'filename', 'filepath', 'width', 'height', + 'annotations' + ]: + patch_info[k] = v + + x_start, y_start, x_stop, y_stop = window.tolist() + patch_info['x_start'] = x_start + patch_info['y_start'] = y_start + patch_info['id'] = \ + info['id'] + '__' + str(x_stop - x_start) + \ + '__' + str(x_start) + '___' + str(y_start) + patch_info['ori_id'] = info['id'] + + ann['bboxes'] = shift_qbboxes(ann['bboxes'], [-x_start, -y_start]) + patch_info['ann'] = ann + + patch = img[y_start:y_stop, x_start:x_stop] + if padding: + height = y_stop - y_start + width = x_stop - x_start + if height > patch.shape[0] or width > patch.shape[1]: + padding_patch = np.empty((height, width, patch.shape[-1]), + dtype=np.uint8) + if not isinstance(padding_value, (int, float)): + assert len(padding_value) == patch.shape[-1] + padding_patch[...] = padding_value + padding_patch[:patch.shape[0], :patch.shape[1], ...] = patch + patch = padding_patch + patch_info['height'] = patch.shape[0] + patch_info['width'] = patch.shape[1] + + cv2.imwrite( + osp.join(save_dir, patch_info['id'] + '.' + img_ext), patch) + patch_info['filename'] = patch_info['id'] + '.' 
def shift_qbboxes(bboxes, offset: Sequence[float]):
    """Translate quadrilateral boxes by a 2-D offset.

    The ``(dx, dy)`` offset is added to every ``(x, y)`` pair of each box.
    Pass the negated window origin to go from image to window coordinates, or
    the window origin itself to go back.

    TODO Refactor and move to `mmyolo/utils/large_image.py`

    Args:
        bboxes (np.array): Boxes whose last dimension holds interleaved
            ``x, y`` coordinates (e.g. shape (N, 8)).
        offset (Sequence[float]): The translation offsets with shape of (2, ).
            Lists, tuples and arrays are all accepted.

    Returns:
        np.array: Translated boxes with the same shape as ``bboxes``.
    """
    dim = bboxes.shape[-1]
    # np.tile repeats the offset for every vertex regardless of its container
    # type; ``offset * k`` only repeats for list/tuple and scales an ndarray.
    per_coord_offset = np.tile(np.asarray(offset, dtype=np.float32), dim // 2)
    return bboxes + per_coord_offset
def main():
    """Split the requested DOTA phases into patches and save them.

    Steps: parse arguments, set up a file logger under ``out_dir``, create
    the output folders, load the original annotations of every required set,
    derive the patch settings from the split config, then crop every image of
    each phase (in a process pool when ``--nproc`` > 1).
    """
    args, split_cfg = parse_args()

    mkdir_or_exist(args.out_dir)

    # init logger
    log_file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log'
    logger: MMLogger = MMLogger.get_instance(
        'mmyolo',
        log_file=osp.join(args.out_dir, log_file_name),
        log_level='INFO')

    # print configs
    arg_str = ''.join(f'{key} = {value!s}\n'
                      for key, value in args._get_kwargs())

    logger.info('Base Settings:\n' + arg_str)
    logger.info('Split Settings:\n' + split_cfg.pretty_text)

    # make dirs
    _make_dirs(args.out_dir, args.phase, args.overwrite)

    # Load original dota data; each set is loaded once even if several
    # phases need it (e.g. trainval and train).
    required_sets = {
        req_set
        for p in args.phase for req_set in PHASE_REQUIRE_SETS[p]
    }

    loaded_data_set = dict()
    for req_set in required_sets:
        logger.info(f'Starting loading DOTA {req_set} set information.')
        start_time = time.time()

        # Forward --nproc so loading honours the same process limit as
        # splitting instead of always using the function's default.
        infos = load_original_annotations(
            data_root=args.data_root,
            ann_subdir=args.ann_subdir,
            phase=req_set,
            nproc=args.nproc)

        end_time = time.time()
        result_log = f'Finishing loading {req_set} set, '
        result_log += f'get {len(infos)} images, '
        result_log += f'using {end_time - start_time:.3f}s.'
        logger.info(result_log)

        loaded_data_set[req_set] = infos

    # Preprocess patch settings: scale size/overlap by every resize ratio
    # and drop duplicates while keeping the first-seen order.
    patch_settings = []
    for ratio in split_cfg.img_resize_ratio:
        for size, gap in zip(split_cfg.patch_sizes,
                             split_cfg.patch_overlap_sizes):
            size_gap = (int(size / ratio), int(gap / ratio))
            if size_gap not in patch_settings:
                patch_settings.append(size_gap)

    # Split data
    for p in args.phase:
        save_imgs_dir = osp.join(args.out_dir, p, 'images')
        save_anns_dir = osp.join(args.out_dir, p, 'annfiles')

        logger.info(f'Start splitting {p} set images!')
        start = time.time()

        data_infos = []
        for req_set in PHASE_REQUIRE_SETS[p]:
            data_infos.extend(loaded_data_set[req_set])

        # Context managers shut down the manager process and the worker
        # pool once the phase is done, even if a worker raises.
        with Manager() as manager:
            worker = partial(
                single_split,
                patch_settings=patch_settings,
                min_img_ratio=split_cfg.min_img_ratio,
                iof_thr=split_cfg.iof_thr,
                padding=split_cfg.padding,
                padding_value=split_cfg.padding_value,
                save_dir=save_imgs_dir,
                anno_dir=save_anns_dir,
                img_ext=args.save_ext,
                lock=manager.Lock(),
                prog=manager.Value('i', 0),
                total=len(data_infos))

            if args.nproc > 1:
                with Pool(args.nproc) as pool:
                    patch_infos = pool.map(worker, data_infos)
            else:
                patch_infos = list(map(worker, data_infos))

        # Flatten in linear time; unlike reduce() without an initial value
        # this also works when no image was found.
        patch_infos = list(itertools.chain.from_iterable(patch_infos))
        stop = time.time()
        logger.info(
            f'Finish splitting {p} set images in {int(stop - start)} second!!!'
        )
        logger.info(f'Total images number: {len(patch_infos)}')
def format_coco_annotations(points: list, image_id: int, annotations_id: int,
                            category_id: int) -> dict:
    """Gen COCO annotations format label from labelme format label.

    Args:
        points (list): Four ``[x, y]`` vertices of the rectangle bbox.
            ``points[0]`` is used as the top-left corner and ``points[1]``
            as the bottom-right corner; ``points[2]`` and ``points[3]`` are
            the two remaining corners.
        image_id (int): Image id.
        annotations_id (int): Annotations id.
        category_id (int): Category id of the annotated object.

    Return:
        annotation_info (dict): COCO annotation data. All numeric values are
            native Python numbers so the result can be passed to
            ``json.dump`` directly.
    """
    (left, top), (right, bottom) = points[0], points[1]
    box_w = right - left
    box_h = bottom - top

    # The polygon walks the corners in the order p0 -> p2 -> p1 -> p3 so that
    # consecutive vertices share an edge of the rectangle.
    # ``tolist()`` converts numpy scalars to plain int/float; ``np.int64`` is
    # not JSON serializable, which made integer-valued points crash on dump.
    polygon = np.asarray(points)[[0, 2, 1, 3]].flatten().tolist()

    annotation_info = dict()
    annotation_info['iscrowd'] = 0
    annotation_info['category_id'] = category_id
    annotation_info['id'] = annotations_id
    annotation_info['image_id'] = image_id
    # bbox is [x1, y1, w, h]
    annotation_info['bbox'] = [left, top, box_w, box_h]
    annotation_info['area'] = box_w * box_h  # bbox w * h
    annotation_info['segmentation'] = [polygon]

    return annotation_info
+ ] + } + """ + + # init coco json field + coco_json = {'images': [], 'categories': [], 'annotations': []} + + image_id = 0 + annotations_id = 0 + if all_classes_id is None: + category_to_id = dict() + categories_labels = [] + else: + category_to_id = all_classes_id + categories_labels = list(all_classes_id.keys()) + + # add class_ids and class_names to the categories list in coco_json + for class_name, class_id in category_to_id.items(): + coco_json['categories'].append({ + 'id': class_id, + 'name': class_name + }) + + # filter incorrect image file + img_file_list = [ + img_file for img_file in Path(image_dir).iterdir() + if img_file.suffix.lower() in IMG_EXTENSIONS + ] + + for img_file in track_iter_progress(img_file_list): + + # get label file according to the image file name + label_path = Path(labels_root).joinpath( + img_file.stem).with_suffix('.json') + if not label_path.exists(): + print(f'Can not find label file: {label_path}, skip...') + continue + + # load labelme label + with open(label_path, encoding='utf-8') as f: + labelme_data = json.load(f) + + image_id = image_id + 1 # coco id begin from 1 + + # update coco 'images' field + coco_json['images'].append({ + 'height': + labelme_data['imageHeight'], + 'width': + labelme_data['imageWidth'], + 'id': + image_id, + 'file_name': + Path(labelme_data['imagePath']).name + }) + + for label_shapes in labelme_data['shapes']: + + # Update coco 'categories' field + class_name = label_shapes['label'] + + if (all_classes_id is None) and (class_name + not in categories_labels): + # only update when not been added before + coco_json['categories'].append({ + 'id': + len(categories_labels) + 1, # categories id start with 1 + 'name': class_name + }) + categories_labels.append(class_name) + category_to_id[class_name] = len(categories_labels) + + elif (all_classes_id is not None) and (class_name + not in categories_labels): + # check class name + raise ValueError(f'Got unexpected class name {class_name}, ' + 'which is not 
def convert_labelme_to_coco(image_dir: str,
                            labels_dir: str,
                            out_path: str,
                            class_id_txt: Optional[str] = None):
    """Convert labelme format label to COCO json format label.

    Args:
        image_dir (str): Image dir path.
        labels_dir (str): Image label path.
        out_path (str): COCO json file save path. Must end with ``.json``.
        class_id_txt (Optional[str]): All class id txt file path, one
            ``"id class_name"`` pair per line. Blank lines are ignored.
            When None, the categories are collected from the label files and
            written to ``class_with_id.txt`` next to ``out_path``.
            Default None.

    Raises:
        ValueError: If a non-blank line of ``class_id_txt`` does not consist
            of exactly two whitespace-separated fields.
    """
    assert Path(out_path).suffix == '.json'

    if class_id_txt is not None:
        assert Path(class_id_txt).suffix == '.txt'

        with open(class_id_txt, encoding='utf-8') as f:
            # A trailing empty line is a normal way to end a text file, so
            # blank lines are dropped instead of being reported as errors.
            txt_lines = [line for line in f.read().splitlines()
                         if line.strip()]
        assert len(txt_lines) > 0

        all_classes_id = dict()
        for txt_line in txt_lines:
            # split on any whitespace so that "1 " is reported as a missing
            # class name instead of silently yielding an empty name
            class_info = txt_line.split()
            if len(class_info) != 2:
                raise ValueError('Error parse "class_id_txt" file '
                                 f'{class_id_txt}, please check if some of '
                                 'the class names is blank, like "1  " -> '
                                 '"1 blank", or class name has space between'
                                 ' words, like "1 Big house" -> "1 '
                                 'Big-house".')
            v, k = class_info
            all_classes_id.update({k: int(v)})
    else:
        all_classes_id = None

    # convert to coco json
    coco_json_data, category_to_id = parse_labelme_to_coco(
        image_dir, labels_dir, all_classes_id)

    # save json result
    Path(out_path).parent.mkdir(exist_ok=True, parents=True)
    print(f'Saving json to {out_path}')
    # use a context manager so the file is flushed and closed deterministically
    with open(out_path, 'w') as f:
        json.dump(coco_json_data, f, indent=2)

    if class_id_txt is None:
        category_to_id_path = Path(out_path).with_name('class_with_id.txt')
        print(f'Saving class id txt to {category_to_id_path}')
        with open(category_to_id_path, 'w', encoding='utf-8') as f:
            for k, v in category_to_id.items():
                f.write(f'{v} {k}\n')
    else:
        print('Not Saving new class id txt, user should using '
              f'{class_id_txt} for training config')
yolo-style dataset to the coco format. + +Usage: + $ python yolo2coco.py /path/to/dataset # image_dir + +Note: + 1. Before running this script, please make sure the root directory + of your dataset is formatted in the following structure: + . + └── $ROOT_PATH + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... + 2. The script will automatically check whether the corresponding + `train.txt`, `val.txt`, and `test.txt` exist under your `image_dir` + or not. If these files are detected, the script will organize the + dataset. The image paths in these files must be ABSOLUTE paths. + 3. Once the script finishes, the result files will be saved in the + directory named 'annotations' in the root directory of your dataset. + The default output file is result.json. The root directory folder may + look like this after the conversion: + . + └── $ROOT_PATH + ├── annotations + │ ├── result.json + │ └── ... + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... + 4. After converting to coco, you can use the + `tools/analysis_tools/browse_coco_json.py` script to visualize + whether it is correct. 
def convert_bbox_info(label, idx, obj_count, image_height, image_width):
    """Turn one yolo label line into a coco annotation dict.

    The line is expected to hold ``class cx cy w h`` where the four box
    values are fractions of the image size. Returns the annotation together
    with the incremented object counter.
    """
    fields = label.strip().split()
    cx, cy, rel_w, rel_h = (float(fields[pos]) for pos in range(1, 5))

    # normalized centre/size -> absolute corner coordinates
    left = (cx - rel_w / 2) * image_width
    top = (cy - rel_h / 2) * image_height
    right = (cx + rel_w / 2) * image_width
    bottom = (cy + rel_h / 2) * image_height

    category = int(fields[0])
    box_w = max(0., right - left)
    box_h = max(0., bottom - top)

    annotation = dict()
    annotation['image_id'] = idx
    annotation['id'] = obj_count
    annotation['category_id'] = category
    annotation['bbox'] = [left, top, box_w, box_h]
    annotation['area'] = box_w * box_h
    annotation['segmentation'] = [[
        left, top, right, top, right, bottom, left, bottom
    ]]
    annotation['iscrowd'] = 0
    return annotation, obj_count + 1
def convert_yolo_to_coco(image_dir: str):
    """Convert annotations from yolo style to coco style.

    The result is written to ``<image_dir>/annotations``: a single
    ``result.json`` when no ``train.txt``/``val.txt``/``test.txt`` is found in
    ``image_dir``, otherwise one ``<split>.json`` per split file found.

    When split files exist, an image that is not listed in any of them is
    skipped with a warning (it used to be appended to whichever dataset the
    previous image belonged to). Blank lines in a label file are ignored.

    Args:
        image_dir (str): the root directory of your datasets which contains
            labels, images, classes.txt, etc

    Raises:
        FileNotFoundError: If ``image_dir`` or one of ``labels``, ``images``
            or ``classes.txt`` inside it does not exist.
    """
    print(f'Start to load existing images and annotations from {image_dir}')
    check_existence(image_dir)

    # check local environment
    yolo_label_dir = osp.join(image_dir, 'labels')
    yolo_image_dir = osp.join(image_dir, 'images')
    yolo_class_txt = osp.join(image_dir, 'classes.txt')
    check_existence(yolo_label_dir)
    check_existence(yolo_image_dir)
    check_existence(yolo_class_txt)
    print(f'All necessary files are located at {image_dir}')

    split_names = ('train', 'val', 'test')
    existed_categories = []
    print(f'Checking if train.txt, val.txt, and test.txt are in {image_dir}')
    for split in split_names:
        if osp.exists(osp.join(image_dir, f'{split}.txt')):
            print(f'Found {split}.txt')
            existed_categories.append(split)

    # prepare the output folders
    output_folder = osp.join(image_dir, 'annotations')
    os.makedirs(output_folder, exist_ok=True)
    check_existence(output_folder)

    # start the convert procedure
    with open(yolo_class_txt) as f:
        classes = f.read().strip().split()

    indices = os.listdir(yolo_image_dir)
    total = len(indices)

    def new_dataset():
        # category id starts from 0
        return {
            'images': [],
            'annotations': [],
            'categories': [{
                'id': i,
                'name': cls
            } for i, cls in enumerate(classes)]
        }

    dataset = None
    split_datasets = {}
    split_images = {}
    if not existed_categories:
        print('These files are not located, no need to organize separately.')
        dataset = new_dataset()
    else:
        print('Need to organize the data accordingly.')
        split_datasets = {split: new_dataset() for split in split_names}
        train_img, val_img, test_img = organize_by_existing_files(
            image_dir, existed_categories)
        # sets give O(1) membership tests inside the per-image loop
        split_images = {
            'train': set(train_img),
            'val': set(val_img),
            'test': set(test_img)
        }

    obj_count = 0
    skipped = 0
    converted = 0
    for idx, image in enumerate(mmengine.track_iter_progress(indices)):
        if existed_categories:
            # first matching split wins, in train -> val -> test order
            split = next(
                (name for name in split_names if image in split_images[name]),
                None)
            if split is None:
                print(f'WARNING: {image} is not listed in any split file, '
                      'skip...')
                skipped += 1
                continue
            dataset = split_datasets[split]

        img_info_dict, image_height, image_width = get_image_info(
            yolo_image_dir, idx, image)
        dataset['images'].append(img_info_dict)

        img_name = osp.splitext(image)[0]
        label_path = f'{osp.join(yolo_label_dir, img_name)}.txt'
        if not osp.exists(label_path):
            # if current image is not annotated or the annotation file failed
            print(
                f'WARNING: {label_path} does not exist. Please check the file.'
            )
            skipped += 1
            continue

        with open(label_path) as f:
            labels = f.readlines()
        for label in labels:
            if not label.strip():
                # an empty line carries no box; parsing it would fail
                continue
            coco_info, obj_count = convert_bbox_info(
                label, idx, obj_count, image_height, image_width)
            dataset['annotations'].append(coco_info)
        converted += 1

    # saving results to result json
    if not existed_categories:
        out_file = osp.join(output_folder, 'result.json')
        print(f'Saving converted results to {out_file} ...')
        mmengine.dump(dataset, out_file)
    else:
        for category in existed_categories:
            out_file = osp.join(output_folder, f'{category}.json')
            print(f'Saving converted results to {out_file} ...')
            mmengine.dump(split_datasets[category], out_file)

    # simple statistics
    print(f'Process finished! Please check at {output_folder} .')
    print(f'Number of images found: {total}, converted: {converted}, '
          f'and skipped: {skipped}. Total annotation count: {obj_count}.')
    print('You can use tools/analysis_tools/browse_coco_json.py to visualize!')
def split_coco_dataset(coco_json_path: str, save_dir: str, ratios: list,
                       shuffle: bool, seed: int):
    """Split a COCO json file into train(val)/val/test json files.

    Args:
        coco_json_path (str): Path of the COCO json label file to split.
        save_dir (str): Directory the new json files are written to. It is
            created when missing.
        ratios (list): Two numbers produce ``trainval.json`` + ``test.json``;
            three numbers produce ``train.json`` + ``val.json`` +
            ``test.json``. The values are normalized by their sum.
        shuffle (bool): Whether to shuffle the image ids before splitting.
        seed (int): Random seed, ``-1`` means do not seed. The seed is applied
            to both ``random`` (which performs the shuffle) and ``np.random``.

    Raises:
        FileNotFoundError: If ``coco_json_path`` does not exist.
        ValueError: If ``ratios`` does not contain 2 or 3 values.
    """
    if not Path(coco_json_path).exists():
        raise FileNotFoundError(f'Can not find {coco_json_path}')

    Path(save_dir).mkdir(parents=True, exist_ok=True)

    if ratios is None or len(ratios) not in (2, 3):
        raise ValueError('ratios must set 2 or 3 group!')

    # ratio normalize
    ratios = np.array(ratios) / np.array(ratios).sum()

    if len(ratios) == 2:
        ratio_train, ratio_test = ratios
        ratio_val = 0
        train_type = 'trainval'
    else:
        ratio_train, ratio_val, ratio_test = ratios
        train_type = 'train'

    # Read coco info
    coco = COCO(coco_json_path)
    coco_image_ids = coco.getImgIds()

    # gen image number of each dataset
    val_image_num = int(len(coco_image_ids) * ratio_val)
    test_image_num = int(len(coco_image_ids) * ratio_test)
    train_image_num = len(coco_image_ids) - val_image_num - test_image_num
    print('Split info: ====== \n'
          f'Train ratio = {ratio_train}, number = {train_image_num}\n'
          f'Val ratio = {ratio_val}, number = {val_image_num}\n'
          f'Test ratio = {ratio_test}, number = {test_image_num}')

    seed = int(seed)
    if seed != -1:
        print(f'Set the global seed: {seed}')
        # ``random.shuffle`` below draws from the ``random`` module, so that
        # generator must be seeded for the split to be reproducible.
        random.seed(seed)
        np.random.seed(seed)

    if shuffle:
        print('shuffle dataset.')
        random.shuffle(coco_image_ids)

    # split each dataset; every entry carries its own output name so that two
    # splits with equal content (e.g. both empty) can never be mixed up
    val_end = train_image_num + val_image_num
    splits = [(train_type, coco_image_ids[:train_image_num])]
    if val_image_num != 0:
        splits.append(('val', coco_image_ids[train_image_num:val_end]))
    splits.append(('test', coco_image_ids[val_end:]))

    # Save new json
    categories = coco.loadCats(coco.getCatIds())
    for split_name, img_id_list in splits:
        # Gen new json
        img_dict = {
            'images': coco.loadImgs(ids=img_id_list),
            'categories': categories,
            'annotations': coco.loadAnns(coco.getAnnIds(imgIds=img_id_list))
        }

        # save json
        json_file_path = Path(save_dir, f'{split_name}.json')
        print(f'Saving json to {json_file_path}')
        with open(json_file_path, 'w') as f_json:
            json.dump(img_dict, f_json, ensure_ascii=False, indent=2)

    print('All done!')
def download(url, dir, unzip=True, delete=False, threads=1):
    """Download (or move) one or more files into ``dir``.

    Args:
        url (str | Path | Iterable[str]): A single URL / local file path or a
            collection of them. A local file is moved into ``dir`` instead of
            being downloaded.
        dir (str | Path): Directory the files are stored in.
        unzip (bool): Whether to extract ``.zip`` / ``.tar`` files into
            ``dir``. Defaults to True.
        delete (bool): Whether to delete an archive after it was extracted.
            Defaults to False.
        threads (int): Number of worker threads. Values above 1 fetch the
            files concurrently. Defaults to 1.

    Errors raised while handling a file propagate to the caller in both the
    single-threaded and the multi-threaded mode.
    """

    def download_one(url, dir):
        f = dir / Path(url).name
        if Path(url).is_file():
            Path(url).rename(f)
        elif not f.exists():
            print(f'Downloading {url} to {f}')
            torch.hub.download_url_to_file(url, f, progress=True)
        if unzip and f.suffix in ('.zip', '.tar'):
            print(f'Unzipping {f.name}')
            # context managers close the archive handle after extraction
            if f.suffix == '.zip':
                with ZipFile(f) as archive:
                    archive.extractall(path=dir)
            elif f.suffix == '.tar':
                with TarFile(f) as archive:
                    archive.extractall(path=dir)
            if delete:
                f.unlink()
                print(f'Delete {f}')

    dir = Path(dir)
    # a bare str/Path is one item; iterating it would yield single characters
    urls = [url] if isinstance(url, (str, Path)) else list(url)
    if threads > 1:
        with ThreadPool(threads) as pool:
            # ``map`` blocks until every job finished and re-raises the first
            # worker error instead of dropping it silently
            pool.map(lambda u: download_one(u, dir), urls)
    else:
        for u in urls:
            download_one(u, dir)


def main():
    """Download the dataset selected on the command line."""
    args = parse_args()
    path = Path(args.save_dir)
    path.mkdir(parents=True, exist_ok=True)
    data2url = dict(
        # TODO: Support for downloading Panoptic Segmentation of COCO
        coco2017=[
            'http://images.cocodataset.org/zips/train2017.zip',
            'http://images.cocodataset.org/zips/val2017.zip',
            'http://images.cocodataset.org/zips/test2017.zip',
            'http://images.cocodataset.org/annotations/' +
            'annotations_trainval2017.zip'
        ],
        lvis=[
            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip',  # noqa
            # the second entry used to repeat the train annotations
            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_val.json.zip',  # noqa
        ],
        voc2007=[
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',  # noqa
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',  # noqa
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar',  # noqa
        ],
        voc2012=[
            'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',  # noqa
        ],
        balloon=[
            # src link: https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip # noqa
            'https://download.openmmlab.com/mmyolo/data/balloon_dataset.zip'
        ],
        cat=[
            'https://download.openmmlab.com/mmyolo/data/cat_dataset.zip'  # noqa
        ],
    )
    url = data2url.get(args.dataset_name, None)
    if url is None:
        print('Only support COCO, VOC, balloon, cat and LVIS now!')
        return
    download(
        url,
        dir=path,
        unzip=args.unzip,
        delete=args.delete,
        threads=args.threads)
def _process_data(args,
                  in_dataset_type: str,
                  out_dataset_type: str,
                  year: str = '2017'):
    """Copy a random subset of one COCO split into ``args.out_dir``.

    Annotations of ``in_dataset_type`` are filtered by ``args.classes`` and
    ``args.area_size``; up to ``args.num_img`` of the remaining images are
    copied to the ``out_dataset_type`` image folder and a matching annotation
    json is dumped.
    """
    assert in_dataset_type in ('train', 'val')
    assert out_dataset_type in ('train', 'val')

    src_ann_rel = f'annotations/instances_{in_dataset_type}{year}.json'
    dst_ann_rel = f'annotations/instances_{out_dataset_type}{year}.json'

    ann_path = osp.join(args.root, src_ann_rel)
    source_json = mmengine.load(ann_path)

    subset = dict(
        info=source_json['info'],
        licenses=source_json['licenses'],
        categories=source_json['categories'],
        images=[],
        annotations=[])

    area_ranges = dict(
        small=[0., 32 * 32],
        medium=[32 * 32, 96 * 96],
        large=[96 * 96, float('inf')])

    coco = COCO(ann_path)

    # filter annotations by category ids and area range
    areaRng = []
    if args.area_size:
        areaRng = area_ranges[args.area_size]
    catIds = []
    if args.classes:
        catIds = coco.getCatIds(args.classes)
    kept_anns = coco.loadAnns(coco.getAnnIds(catIds=catIds, areaRng=areaRng))

    # get image ids by anns set
    kept_img_ids = {ann['image_id'] for ann in kept_anns}
    candidates = coco.loadImgs(kept_img_ids)

    # shuffle
    np.random.shuffle(candidates)

    available = len(candidates)
    num_img = args.num_img if args.num_img > 0 else available
    if num_img > available:
        print(
            f'num_img is too big, will be set to {len(candidates)}, '
            'because of not enough image after filter by classes and area_size'
        )
        num_img = available

    progress_bar = mmengine.ProgressBar(num_img)
    src_img_dir = in_dataset_type + year
    dst_img_dir = osp.join(args.out_dir, out_dataset_type + year)

    for img in candidates[:num_img]:
        image_path = osp.join(args.root, src_img_dir, img['file_name'])

        img_ann_ids = coco.getAnnIds(
            imgIds=[img['id']], catIds=catIds, areaRng=areaRng)

        subset['images'].append(img)
        subset['annotations'].extend(coco.loadAnns(img_ann_ids))

        shutil.copy(image_path, dst_img_dir)

        progress_bar.update()

    mmengine.dump(subset, osp.join(args.out_dir, dst_ann_rel))
def main():
    """Entry point: build the train and val subsets under ``args.out_dir``."""
    args = parse_args()
    assert args.out_dir != args.root, \
        'The file will be overwritten in place, ' \
        'so the same folder is not allowed !'

    seed = int(args.seed)
    if seed != -1:
        print(f'Set the global seed: {seed}')
        np.random.seed(seed)

    _make_dirs(args.out_dir)

    # the train subset comes from the val split unless explicitly requested
    train_source = 'train' if args.use_training_set else 'val'

    print('====Start processing train dataset====')
    _process_data(args, train_source, 'train')
    print('\n====Start processing val dataset====')
    _process_data(args, 'val', 'val')
    print(f'\n Result save to {args.out_dir}')
def main():
    """Print the fully resolved config and optionally dump it to a file.

    When ``--save-path`` is given, its suffix must be ``.py``, ``.json`` or
    ``.yml``; missing parent directories are created.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)

    # replace the ${key} with the value of cfg.key
    cfg = replace_cfg_vals(cfg)

    # update data root according to MMDET_DATASETS
    update_data_root(cfg)

    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    print(f'Config:\n{cfg.pretty_text}')

    if args.save_path is not None:
        save_path = args.save_path

        suffix = os.path.splitext(save_path)[-1]
        assert suffix in ['.py', '.json', '.yml']

        # A bare file name has an empty directory part; ``os.makedirs('')``
        # raises FileNotFoundError, so only create real directories.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        cfg.dump(save_path)
        print(f'Config saving at {save_path}')
def parse_args():
    """Parse the input and output checkpoint paths from the command line."""
    parser = argparse.ArgumentParser(
        description='Process a checkpoint to be published')
    parser.add_argument('in_file', help='input checkpoint filename')
    parser.add_argument('out_file', help='output checkpoint filename')
    args = parser.parse_args()
    return args


def process_checkpoint(in_file, out_file):
    """Strip training-only state from a checkpoint and publish it.

    The slimmed checkpoint is written to ``out_file`` and then renamed to
    ``<out_file without .pth>-<first 8 hex chars of its sha256>.pth``.

    Args:
        in_file (str): Path of the checkpoint to read.
        out_file (str): Path the slimmed checkpoint is first written to.

    Returns:
        str: Path of the final, hash-suffixed checkpoint file.
    """
    # Local imports: these names are not part of the module's import block.
    import hashlib
    import os
    import re

    checkpoint = torch.load(in_file, map_location='cpu')

    # remove optimizer (and other training-only sections) for smaller size
    for section in ('optimizer', 'message_hub', 'ema_state_dict'):
        checkpoint.pop(section, None)

    state_dict = checkpoint['state_dict']
    dropped_markers = ('priors_base_sizes', 'grid_offset', 'prior_inds')
    for key in list(state_dict):
        if key.startswith('data_preprocessor') or any(
                marker in key for marker in dropped_markers):
            state_dict.pop(key)

    # Compare the version numerically: as strings, '1.10' < '1.6'.
    major_minor = tuple(
        int(part) for part in re.findall(r'\d+', torch.__version__)[:2])
    if major_minor >= (1, 6):
        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
    else:
        torch.save(checkpoint, out_file)

    # Hash in-process instead of shelling out to `sha256sum`.
    digest = hashlib.sha256()
    with open(out_file, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    sha = digest.hexdigest()

    if out_file.endswith('.pth'):
        out_file_name = out_file[:-4]
    else:
        out_file_name = out_file
    final_file = out_file_name + f'-{sha[:8]}.pth'
    # Synchronous rename; the former un-awaited `mv` subprocess could still
    # be running (or have failed silently) when the script exited.
    os.replace(out_file, final_file)
    return final_file


def main():
    """Command-line entry point."""
    args = parse_args()
    process_checkpoint(args.in_file, args.out_file)


if __name__ == '__main__':
    main()
def parse_args():
    """Parse the command line of the KD-to-student checkpoint converter."""
    parser = argparse.ArgumentParser(
        description='Convert KD checkpoint to student-only checkpoint')
    parser.add_argument('checkpoint', help='input checkpoint filename')
    parser.add_argument('--out-path', help='save checkpoint path')
    parser.add_argument(
        '--inplace', action='store_true', help='replace origin ckpt')
    args = parser.parse_args()
    return args


def main():
    """Keep only the ``architecture.*`` weights of a checkpoint.

    Entries of ``state_dict`` whose key starts with ``architecture.`` are
    kept with that prefix removed; all other entries are discarded. The
    result (plus the original ``meta``) either overwrites the input
    (``--inplace``) or is written as ``<stem>_student.pth`` into
    ``--out-path`` / the input's directory.
    """
    args = parse_args()
    checkpoint = CheckpointLoader.load_checkpoint(
        args.checkpoint, map_location='cpu')
    new_meta = checkpoint['meta']

    prefix = 'architecture.'
    # Slice the prefix off rather than str.replace(), which would also
    # delete any later occurrence of 'architecture.' inside the key.
    new_state_dict = {
        key[len(prefix):]: value
        for key, value in checkpoint['state_dict'].items()
        if key.startswith(prefix)
    }

    checkpoint = dict()
    checkpoint['meta'] = new_meta
    checkpoint['state_dict'] = new_state_dict

    if args.inplace:
        # f-prefix was missing, so the path was never interpolated.
        assert osp.exists(args.checkpoint), \
            f'can not find the checkpoint path: {args.checkpoint}'
        save_checkpoint(checkpoint, args.checkpoint)
    else:
        ckpt_path = Path(args.checkpoint)
        ckpt_name = ckpt_path.stem
        if args.out_path:
            ckpt_dir = Path(args.out_path)
        else:
            ckpt_dir = ckpt_path.parent
        mkdir_or_exist(ckpt_dir)
        new_ckpt_path = osp.join(ckpt_dir, f'{ckpt_name}_student.pth')
        save_checkpoint(checkpoint, new_ckpt_path)


if __name__ == '__main__':
    main()
def convert_bn(k: str) -> str:
    """Rename batch-norm statistic suffixes in a parameter key.

    ``._mean`` becomes ``.running_mean`` and ``._variance`` becomes
    ``.running_var``; keys without those substrings are returned unchanged.
    """
    name = k.replace('._mean',
                     '.running_mean').replace('._variance', '.running_var')
    return name


def convert_repvgg(k: str) -> str:
    """Rename the two branches nested under ``.conv2.`` in a parameter key.

    ``.conv2.conv1.`` becomes ``.conv2.rbr_dense.`` and ``.conv2.conv2.``
    becomes ``.conv2.rbr_1x1.``; any other key is returned unchanged.
    """
    if '.conv2.conv1.' in k:
        name = k.replace('.conv2.conv1.', '.conv2.rbr_dense.')
        return name
    elif '.conv2.conv2.' in k:
        name = k.replace('.conv2.conv2.', '.conv2.rbr_1x1.')
        return name
    else:
        return k


def convert(src: str, dst: str, imagenet_pretrain: bool = False) -> None:
    """Rewrite the keys of a pickled weight dict and save it with torch.

    Args:
        src: Path of the pickled ``{key: array}`` mapping to read.
        dst: Path the ``{'state_dict': ...}`` checkpoint is saved to.
        imagenet_pretrain: When True every key is treated as a backbone key
            and gets a ``backbone.`` prefix; otherwise keys are dispatched on
            their ``backbone.`` / ``neck.`` / ``yolo_head.`` prefix and keys
            with any other prefix are skipped.

    Raises:
        NotImplementedError: for a ``neck.`` key that matches none of the
            handled sub-prefixes.
    """
    # NOTE(review): pickle.load executes arbitrary code embedded in the
    # file; only run this on weight files from a trusted source.
    with open(src, 'rb') as f:
        model = pickle.load(f)

    new_state_dict = OrderedDict()
    if imagenet_pretrain:
        for k, v in model.items():
            if '@@' in k:
                continue
            if 'stem.' in k:
                # backbone.stem.conv1.conv.weight
                # -> backbone.stem.0.conv.weight
                # Only the last character of the second component is read,
                # so the stem index is assumed to be a single digit.
                org_ind = k.split('.')[1][-1]
                new_ind = str(int(org_ind) - 1)
                name = k.replace('stem.conv%s.' % org_ind,
                                 'stem.%s.' % new_ind)
            else:
                # backbone.stages.1.conv2.bn._variance
                # -> backbone.stage2.0.conv2.bn.running_var
                org_stage_ind = k.split('.')[1]
                new_stage_ind = str(int(org_stage_ind) + 1)
                name = k.replace('stages.%s.' % org_stage_ind,
                                 'stage%s.0.' % new_stage_ind)
                name = convert_repvgg(name)
                if '.attn.' in k:
                    name = name.replace('.attn.fc.', '.attn.fc.conv.')
            name = convert_bn(name)
            name = 'backbone.' + name

            new_state_dict[name] = torch.from_numpy(v)
    else:
        for k, v in model.items():
            name = k
            if k.startswith('backbone.'):
                if '.stem.' in k:
                    # backbone.stem.conv1.conv.weight
                    # -> backbone.stem.0.conv.weight
                    org_ind = k.split('.')[2][-1]
                    new_ind = str(int(org_ind) - 1)
                    name = k.replace('.stem.conv%s.' % org_ind,
                                     '.stem.%s.' % new_ind)
                else:
                    # backbone.stages.1.conv2.bn._variance
                    # -> backbone.stage2.0.conv2.bn.running_var
                    org_stage_ind = k.split('.')[2]
                    new_stage_ind = str(int(org_stage_ind) + 1)
                    name = k.replace('.stages.%s.' % org_stage_ind,
                                     '.stage%s.0.' % new_stage_ind)
                    name = convert_repvgg(name)
                    if '.attn.' in k:
                        name = name.replace('.attn.fc.', '.attn.fc.conv.')
                name = convert_bn(name)
            elif k.startswith('neck.'):
                # fpn_stages
                if k.startswith('neck.fpn_stages.'):
                    # neck.fpn_stages.0.0.conv1.conv.weight
                    # -> neck.reduce_layers.2.0.conv1.conv.weight
                    if k.startswith('neck.fpn_stages.0.0.'):
                        name = k.replace('neck.fpn_stages.0.0.',
                                         'neck.reduce_layers.2.0.')
                        if '.spp.' in name:
                            name = name.replace('.spp.conv.', '.spp.conv2.')
                    # neck.fpn_stages.1.0.conv1.conv.weight
                    # -> neck.top_down_layers.0.0.conv1.conv.weight
                    elif k.startswith('neck.fpn_stages.1.0.'):
                        name = k.replace('neck.fpn_stages.1.0.',
                                         'neck.top_down_layers.0.0.')
                    elif k.startswith('neck.fpn_stages.2.0.'):
                        name = k.replace('neck.fpn_stages.2.0.',
                                         'neck.top_down_layers.1.0.')
                    else:
                        raise NotImplementedError('Not implemented.')
                    name = name.replace('.0.convs.', '.0.blocks.')
                elif k.startswith('neck.fpn_routes.'):
                    # neck.fpn_routes.0.conv.weight
                    # -> neck.upsample_layers.0.0.conv.weight
                    # Rebuilt from the route index plus the last two key
                    # components only.
                    index = k.split('.')[2]
                    name = 'neck.upsample_layers.' + index + '.0.' + '.'.join(
                        k.split('.')[-2:])
                    name = name.replace('.0.convs.', '.0.blocks.')
                elif k.startswith('neck.pan_stages.'):
                    # neck.pan_stages.0.0.conv1.conv.weight
                    # -> neck.bottom_up_layers.1.0.conv1.conv.weight
                    # Index '1' maps to '0'; every other index maps to '1'.
                    ind = k.split('.')[2]
                    name = k.replace(
                        'neck.pan_stages.' + ind, 'neck.bottom_up_layers.' +
                        ('0' if ind == '1' else '1'))
                    name = name.replace('.0.convs.', '.0.blocks.')
                elif k.startswith('neck.pan_routes.'):
                    # neck.pan_routes.0.conv.weight
                    # -> neck.downsample_layers.0.conv.weight
                    # NOTE(review): the code maps index '1' to '0' and every
                    # other index to '1', so the example above would yield
                    # downsample_layers.1 -- confirm which is intended.
                    ind = k.split('.')[2]
                    name = k.replace(
                        'neck.pan_routes.' + ind, 'neck.downsample_layers.' +
                        ('0' if ind == '1' else '1'))
                    name = name.replace('.0.convs.', '.0.blocks.')

                else:
                    raise NotImplementedError('Not implement.')
                name = convert_repvgg(name)
                name = convert_bn(name)
            elif k.startswith('yolo_head.'):
                # These entries are not copied into the output.
                if ('anchor_points' in k) or ('stride_tensor' in k):
                    continue
                if 'proj_conv' in k:
                    name = k.replace('yolo_head.proj_conv.',
                                     'bbox_head.head_module.proj_conv.')
                else:
                    for org_key, rep_key in [
                        [
                            'yolo_head.stem_cls.',
                            'bbox_head.head_module.cls_stems.'
                        ],
                        [
                            'yolo_head.stem_reg.',
                            'bbox_head.head_module.reg_stems.'
                        ],
                        [
                            'yolo_head.pred_cls.',
                            'bbox_head.head_module.cls_preds.'
                        ],
                        [
                            'yolo_head.pred_reg.',
                            'bbox_head.head_module.reg_preds.'
                        ]
                    ]:
                        name = name.replace(org_key, rep_key)
                    # The fourth dotted component is the level index; it is
                    # mirrored (0 <-> 2, 1 stays 1).
                    name = name.split('.')
                    ind = name[3]
                    name[3] = str(2 - int(ind))
                    name = '.'.join(name)
                name = convert_bn(name)
            else:
                continue

            new_state_dict[name] = torch.from_numpy(v)
    data = {'state_dict': new_state_dict}
    torch.save(data, dst)


def main():
    """Command-line entry point: parse --src/--dst/--imagenet-pretrain."""
    parser = argparse.ArgumentParser(description='Convert model keys')
    parser.add_argument(
        '--src',
        default='ppyoloe_plus_crn_s_80e_coco.pdparams',
        help='src ppyoloe model path')
    parser.add_argument(
        '--dst', default='mmppyoloe_plus_s.pt', help='save path')
    parser.add_argument(
        '--imagenet-pretrain',
        action='store_true',
        default=False,
        help='Load model pretrained on imagenet dataset which only '
        'have weight for backbone.')
    args = parser.parse_args()
    convert(args.src, args.dst, args.imagenet_pretrain)


if __name__ == '__main__':
    main()
def convert(src, dst):
    """Convert keys in pretrained RTMDet models to MMYOLO style.

    Args:
        src (str): Path of the checkpoint to read; must contain
            ``state_dict`` and may contain ``meta``.
        dst (str): Path the converted checkpoint is written to.

    The output holds the renamed ``state_dict`` and the source checkpoint's
    ``meta`` (``None`` when the source has none). ``data_preprocessor``
    entries are dropped.
    """
    # Load onto CPU so conversion does not need the device the checkpoint
    # was saved from.
    source = torch.load(src, map_location='cpu')
    blobs = source['state_dict']
    state_dict = OrderedDict()

    for key, weight in blobs.items():
        if 'neck.reduce_layers.0' in key:
            # Rename only the layer index, not every '.0' in the key.
            new_key = key.replace('reduce_layers.0', 'reduce_layers.2')
        elif 'neck.reduce_layers.1' in key:
            new_key = key.replace('reduce_layers.1', 'top_down_layers.0.1')
        elif 'neck.top_down_blocks.0' in key:
            new_key = key.replace('down_blocks', 'down_layers.0')
        elif 'neck.top_down_blocks.1' in key:
            new_key = key.replace('down_blocks', 'down_layers')
        elif 'downsamples' in key:
            new_key = key.replace('downsamples', 'downsample_layers')
        elif 'bottom_up_blocks' in key:
            new_key = key.replace('bottom_up_blocks', 'bottom_up_layers')
        elif 'out_convs' in key:
            new_key = key.replace('out_convs', 'out_layers')
        elif 'bbox_head' in key:
            new_key = key.replace('bbox_head', 'bbox_head.head_module')
        elif 'data_preprocessor' in key:
            continue
        else:
            new_key = key
        state_dict[new_key] = weight
        print(f'Convert {key} to {new_key}')

    # save checkpoint
    checkpoint = dict()
    checkpoint['state_dict'] = state_dict
    # 'meta' lives next to 'state_dict' in the checkpoint, not inside it.
    checkpoint['meta'] = source.get('meta')
    torch.save(checkpoint, dst)


def main():
    """Command-line entry point: ``src`` and ``dst`` positional paths."""
    parser = argparse.ArgumentParser(description='Convert model keys')
    parser.add_argument('src', help='src rtm model path')
    parser.add_argument('dst', help='save path')
    args = parser.parse_args()
    convert(args.src, args.dst)


if __name__ == '__main__':
    main()
# Layer-prefix translation for P5 checkpoints (source prefix -> target).
convert_dict_p5 = {
    'model.0': 'backbone.stem',
    'model.1': 'backbone.stage1.0',
    'model.2': 'backbone.stage1.1',
    'model.3': 'backbone.stage2.0',
    'model.4': 'backbone.stage2.1',
    'model.5': 'backbone.stage3.0',
    'model.6': 'backbone.stage3.1',
    'model.7': 'backbone.stage4.0',
    'model.8': 'backbone.stage4.1',
    'model.9.cv1': 'backbone.stage4.2.conv1',
    'model.9.cv2': 'backbone.stage4.2.conv2',
    'model.10': 'neck.reduce_layers.2',
    'model.13': 'neck.top_down_layers.0.0',
    'model.14': 'neck.top_down_layers.0.1',
    'model.17': 'neck.top_down_layers.1',
    'model.18': 'neck.downsample_layers.0',
    'model.20': 'neck.bottom_up_layers.0',
    'model.21': 'neck.downsample_layers.1',
    'model.23': 'neck.bottom_up_layers.1',
    'model.24.m': 'bbox_head.head_module.convs_pred',
    'model.24.proto': 'bbox_head.head_module.proto_preds',
}

# Layer-prefix translation for P6 checkpoints (source prefix -> target).
convert_dict_p6 = {
    'model.0': 'backbone.stem',
    'model.1': 'backbone.stage1.0',
    'model.2': 'backbone.stage1.1',
    'model.3': 'backbone.stage2.0',
    'model.4': 'backbone.stage2.1',
    'model.5': 'backbone.stage3.0',
    'model.6': 'backbone.stage3.1',
    'model.7': 'backbone.stage4.0',
    'model.8': 'backbone.stage4.1',
    'model.9': 'backbone.stage5.0',
    'model.10': 'backbone.stage5.1',
    'model.11.cv1': 'backbone.stage5.2.conv1',
    'model.11.cv2': 'backbone.stage5.2.conv2',
    'model.12': 'neck.reduce_layers.3',
    'model.15': 'neck.top_down_layers.0.0',
    'model.16': 'neck.top_down_layers.0.1',
    'model.19': 'neck.top_down_layers.1.0',
    'model.20': 'neck.top_down_layers.1.1',
    'model.23': 'neck.top_down_layers.2',
    'model.24': 'neck.downsample_layers.0',
    'model.26': 'neck.bottom_up_layers.0',
    'model.27': 'neck.downsample_layers.1',
    'model.29': 'neck.bottom_up_layers.1',
    'model.30': 'neck.downsample_layers.2',
    'model.32': 'neck.bottom_up_layers.2',
    'model.33.m': 'bbox_head.head_module.convs_pred',
    'model.33.proto': 'bbox_head.head_module.proto_preds',
}


def convert(src, dst):
    """Convert keys in pretrained YOLOv5 models to mmyolo style.

    A source path ending in ``6.pt`` selects the P6 table, anything else the
    P5 table. The renamed weights are saved to ``dst`` under ``state_dict``.

    Raises:
        RuntimeError: if unpickling the model hits ``ModuleNotFoundError``.
    """
    is_p6_model = src.endswith('6.pt')
    if is_p6_model:
        convert_dict = convert_dict_p6
        # Layers whose table entries are keyed by layer AND sub-module.
        split_layers = ('11', '33')
        print('Converting P6 model')
    else:
        convert_dict = convert_dict_p5
        split_layers = ('9', '24')
        print('Converting P5 model')

    try:
        yolov5_model = torch.load(src)['model']
        blobs = yolov5_model.state_dict()
    except ModuleNotFoundError:
        raise RuntimeError(
            'This script must be placed under the ultralytics/yolov5 repo,'
            ' because loading the official pretrained model need'
            ' `model.py` to build model.')

    state_dict = OrderedDict()
    for key, weight in blobs.items():
        num, module = key.split('.')[1:3]
        if num in split_layers:
            if module == 'anchors':
                continue
            prefix = f'model.{num}.{module}'
        else:
            prefix = f'model.{num}'

        new_key = key.replace(prefix, convert_dict[prefix])

        if '.m.' in new_key:
            new_key = new_key.replace('.m.', '.blocks.').replace(
                '.cv', '.conv')
        elif 'bbox_head.head_module.proto_preds.cv' in new_key:
            new_key = new_key.replace(
                'bbox_head.head_module.proto_preds.cv',
                'bbox_head.head_module.proto_preds.conv')
        else:
            for old, new in (('.cv1', '.main_conv'), ('.cv2', '.short_conv'),
                             ('.cv3', '.final_conv')):
                new_key = new_key.replace(old, new)

        state_dict[new_key] = weight
        print(f'Convert {key} to {new_key}')

    # save checkpoint
    torch.save({'state_dict': state_dict}, dst)


# Note: This script must be placed under the yolov5 repo to run.
def main():
    """Command-line entry point: ``--src`` and ``--dst`` paths."""
    parser = argparse.ArgumentParser(description='Convert model keys')
    parser.add_argument(
        '--src', default='yolov5s.pt', help='src yolov5 model path')
    parser.add_argument('--dst', default='mmyolov5s.pt', help='save path')
    cli = parser.parse_args()
    convert(cli.src, cli.dst)


if __name__ == '__main__':
    main()
# Layer-prefix translation for P5 checkpoints (source prefix -> target).
convert_dict_p5 = {
    'model.0': 'backbone.stem',
    'model.1': 'backbone.stage1.0',
    'model.2': 'backbone.stage1.1',
    'model.3': 'backbone.stage2.0',
    'model.4': 'backbone.stage2.1',
    'model.5': 'backbone.stage3.0',
    'model.6': 'backbone.stage3.1',
    'model.7': 'backbone.stage4.0',
    'model.8': 'backbone.stage4.1',
    'model.9': 'backbone.stage4.2',
    'model.10': 'neck.reduce_layers.2',
    'model.13': 'neck.top_down_layers.0.0',
    'model.14': 'neck.top_down_layers.0.1',
    'model.17': 'neck.top_down_layers.1',
    'model.18': 'neck.downsample_layers.0',
    'model.20': 'neck.bottom_up_layers.0',
    'model.21': 'neck.downsample_layers.1',
    'model.23': 'neck.bottom_up_layers.1',
    'model.24': 'bbox_head.head_module',
}


def convert(src, dst):
    """Convert keys in pretrained YOLOv5u models to mmyolo style.

    The renamed weights are saved to ``dst`` under ``state_dict``; the key
    ``bbox_head.head_module.dfl.conv.weight`` is dropped.

    Raises:
        RuntimeError: if unpickling the model hits ``ModuleNotFoundError``.
    """
    convert_dict = convert_dict_p5

    print('Converting P5 model')
    try:
        yolov5_model = torch.load(src)['model']
        blobs = yolov5_model.state_dict()
    except ModuleNotFoundError:
        raise RuntimeError(
            'This script must be placed under the ultralytics repo,'
            ' because loading the official pretrained model need'
            ' `model.py` to build model.')

    dropped_key = 'bbox_head.head_module.dfl.conv.weight'
    state_dict = OrderedDict()
    for key, weight in blobs.items():
        # Two-way unpack kept on purpose: keys are expected to have at
        # least three dotted components.
        num, _ = key.split('.')[1:3]
        prefix = f'model.{num}'
        new_key = key.replace(prefix, convert_dict[prefix])

        if '.m.' in new_key:
            renames = (('.m.', '.blocks.'), ('.cv', '.conv'))
        elif 'bbox_head.head_module' in new_key:
            renames = (('.cv2', '.reg_preds'), ('.cv3', '.cls_preds'))
        elif 'backbone.stage4.2' in new_key:
            renames = (('.cv', '.conv'), )
        else:
            renames = (('.cv1', '.main_conv'), ('.cv2', '.short_conv'),
                       ('.cv3', '.final_conv'))
        for old, new in renames:
            new_key = new_key.replace(old, new)

        if new_key == dropped_key:
            print('Drop "bbox_head.head_module.dfl.conv.weight", '
                  'because it is useless')
            continue
        state_dict[new_key] = weight
        print(f'Convert {key} to {new_key}')

    # save checkpoint
    torch.save({'state_dict': state_dict}, dst)


# Note: This script must be placed under the ultralytics repo to run.
def main():
    """Command-line entry point: ``--src`` and ``--dst`` paths."""
    parser = argparse.ArgumentParser(description='Convert model keys')
    parser.add_argument(
        '--src', default='yolov5su.pt', help='src yolov5u model path')
    parser.add_argument('--dst', default='mmyolov5su.pth', help='save path')
    cli = parser.parse_args()
    convert(cli.src, cli.dst)


if __name__ == '__main__':
    main()
def convert(src, dst):
    """Rename the weights of a YOLOv6 checkpoint and save them to ``dst``.

    The ``ema`` model is used when the checkpoint has a truthy ``ema``
    entry, otherwise ``model``. Keys containing ``anchors``/``anchor_grid``
    and ``detect`` keys containing ``proj`` are skipped.

    Raises:
        RuntimeError: if unpickling the model hits ``ModuleNotFoundError``.
    """
    import sys
    sys.path.append('yolov6')
    try:
        ckpt = torch.load(src, map_location=torch.device('cpu'))
    except ModuleNotFoundError:
        raise RuntimeError(
            'This script must be placed under the meituan/YOLOv6 repo,'
            ' because loading the official pretrained model need'
            ' some python files to build model.')
    # The saved model is the model before reparameterization
    weights_entry = 'ema' if ckpt.get('ema') else 'model'
    model = ckpt[weights_entry].float()

    # Ordered rules, first match wins:
    # (marker looked for in the key, text replaced, replacement,
    #  whether '.cv' / '.m.' are renamed as well)
    rules = (
        ('ERBlock_2', 'ERBlock_2', 'stage1.0', True),
        ('ERBlock_3', 'ERBlock_3', 'stage2.0', True),
        ('ERBlock_4', 'ERBlock_4', 'stage3.0', True),
        ('ERBlock_5', 'ERBlock_5', 'stage4.0', True),
        ('reduce_layer0', 'reduce_layer0', 'reduce_layers.2', False),
        ('Rep_p4', 'Rep_p4', 'top_down_layers.0.0', True),
        ('reduce_layer1', 'reduce_layer1', 'top_down_layers.0.1', True),
        ('Rep_p3', 'Rep_p3', 'top_down_layers.1', True),
        ('upsample0', 'upsample0.upsample_transpose', 'upsample_layers.0',
         False),
        ('upsample1', 'upsample1.upsample_transpose', 'upsample_layers.1',
         False),
        ('Rep_n3', 'Rep_n3', 'bottom_up_layers.0', True),
        ('Rep_n4', 'Rep_n4', 'bottom_up_layers.1', True),
        ('downsample2', 'downsample2', 'downsample_layers.0', False),
        ('downsample1', 'downsample1', 'downsample_layers.1', False),
    )

    new_state_dict = OrderedDict()
    for key, tensor in model.state_dict().items():
        new_key = key
        if 'detect' in key:
            if 'proj' in key:
                continue
            new_key = key.replace('detect', 'bbox_head.head_module')
        if 'anchors' in key or 'anchor_grid' in key:
            continue

        for marker, old, new, rename_inner in rules:
            if marker not in key:
                continue
            new_key = key.replace(old, new)
            if rename_inner:
                if '.cv' in key:
                    new_key = new_key.replace('.cv', '.conv')
                if '.m.' in key:
                    new_key = new_key.replace('.m.', '.block.')
            if marker == 'ERBlock_5' and 'stage4.0.2' in new_key:
                # The third sub-module of the last block becomes stage4.1.
                new_key = new_key.replace('stage4.0.2', 'stage4.1')
                new_key = new_key.replace('cv', 'conv')
            break

        new_state_dict[new_key] = tensor

    torch.save({'state_dict': new_state_dict}, dst)


# Note: This script must be placed under the yolov6 repo to run.
def main():
    """Command-line entry point: ``--src`` and ``--dst`` paths."""
    parser = argparse.ArgumentParser(description='Convert model keys')
    parser.add_argument(
        '--src', default='yolov6s.pt', help='src yolov6 model path')
    parser.add_argument('--dst', default='mmyolov6.pt', help='save path')
    cli = parser.parse_args()
    convert(cli.src, cli.dst)


if __name__ == '__main__':
    main()
def convert(src, dst):
    """Rename the weights of a YOLOv6 v3 checkpoint and save them to ``dst``.

    The ``ema`` model is used when the checkpoint has a truthy ``ema``
    entry, otherwise ``model``. Keys containing ``anchors``/``anchor_grid``
    and ``detect`` keys containing ``proj`` are skipped. When any ``detect``
    key contains ``reg_preds_lrtb``, a second pass renames
    ``reg_preds_lrtb`` to ``reg_preds`` and the previous ``reg_preds`` to
    ``distill_ns_head``.

    Raises:
        RuntimeError: if unpickling the model hits ``ModuleNotFoundError``.
    """
    import sys
    sys.path.append('yolov6')
    try:
        ckpt = torch.load(src, map_location=torch.device('cpu'))
    except ModuleNotFoundError:
        raise RuntimeError(
            'This script must be placed under the meituan/YOLOv6 repo,'
            ' because loading the official pretrained model need'
            ' some python files to build model.')
    # The saved model is the model before reparameterization
    weights_entry = 'ema' if ckpt.get('ema') else 'model'
    model = ckpt[weights_entry].float()

    # Ordered rules, first match wins:
    # (marker, replacement, rename '.cv' / '.m.' too,
    #  collapse '.upsample_transpose.' to '.')
    rules = (
        ('ERBlock_2', 'stage1.0', True, False),
        ('ERBlock_3', 'stage2.0', True, False),
        ('ERBlock_4', 'stage3.0', True, False),
        ('ERBlock_5', 'stage4.0', True, False),
        ('reduce_layer0', 'reduce_layers.2', False, False),
        ('Rep_p4', 'top_down_layers.0.0', True, False),
        ('reduce_layer1', 'top_down_layers.0.1', True, False),
        ('Rep_p3', 'top_down_layers.1', True, False),
        ('Bifusion0', 'upsample_layers.0', True, True),
        ('Bifusion1', 'upsample_layers.1', True, True),
        ('Rep_n3', 'bottom_up_layers.0', True, False),
        ('Rep_n4', 'bottom_up_layers.1', True, False),
        ('downsample2', 'downsample_layers.0', False, False),
        ('downsample1', 'downsample_layers.1', False, False),
    )

    new_state_dict = OrderedDict()
    is_ns = False
    for key, tensor in model.state_dict().items():
        new_key = key
        if 'detect' in key:
            if 'proj' in key:
                continue
            if 'reg_preds_lrtb' in key:
                is_ns = True
            new_key = key.replace('detect', 'bbox_head.head_module')
        if 'anchors' in key or 'anchor_grid' in key:
            continue

        for marker, replacement, rename_inner, drop_transpose in rules:
            if marker not in key:
                continue
            new_key = key.replace(marker, replacement)
            if rename_inner:
                if '.cv' in key:
                    new_key = new_key.replace('.cv', '.conv')
                if '.m.' in key:
                    new_key = new_key.replace('.m.', '.block.')
            if drop_transpose and '.upsample_transpose.' in key:
                new_key = new_key.replace('.upsample_transpose.', '.')
            if marker == 'ERBlock_5' and 'stage4.0.2' in new_key:
                # The third sub-module of the last block becomes stage4.1.
                new_key = new_key.replace('stage4.0.2', 'stage4.1')
                new_key = new_key.replace('cv', 'conv')
            break

        new_state_dict[new_key] = tensor

    # The yolov6_v3_n/s has two regression heads.
    # One called 'reg_preds_lrtb' is a regular anchor-free head,
    # which is used for inference.
    # One called 'reg_preds' is a DFL style head, which
    # is only used in training.
    if is_ns:
        renamed = OrderedDict()
        for key, tensor in new_state_dict.items():
            if 'reg_preds_lrtb' in key:
                target = key.replace('reg_preds_lrtb', 'reg_preds')
            elif 'reg_preds' in key:
                target = key.replace('reg_preds', 'distill_ns_head')
            else:
                target = key
            renamed[target] = tensor
        new_state_dict = renamed

    torch.save({'state_dict': new_state_dict}, dst)


# Note: This script must be placed under the yolov6 repo to run.
+def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov6s.pt', help='src yolov6 model path') + parser.add_argument('--dst', default='mmyolov6.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..1c1f54d0cbf0375dc026c8e6fb234ce9335d85cc --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py @@ -0,0 +1,1093 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import torch + +convert_dict_tiny = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + + # stage1 TinyDownSampleBlock + 'model.2': 'backbone.stage1.0.short_conv', + 'model.3': 'backbone.stage1.0.main_convs.0', + 'model.4': 'backbone.stage1.0.main_convs.1', + 'model.5': 'backbone.stage1.0.main_convs.2', + 'model.7': 'backbone.stage1.0.final_conv', + + # stage2 TinyDownSampleBlock + 'model.9': 'backbone.stage2.1.short_conv', + 'model.10': 'backbone.stage2.1.main_convs.0', + 'model.11': 'backbone.stage2.1.main_convs.1', + 'model.12': 'backbone.stage2.1.main_convs.2', + 'model.14': 'backbone.stage2.1.final_conv', + + # stage3 TinyDownSampleBlock + 'model.16': 'backbone.stage3.1.short_conv', + 'model.17': 'backbone.stage3.1.main_convs.0', + 'model.18': 'backbone.stage3.1.main_convs.1', + 'model.19': 'backbone.stage3.1.main_convs.2', + 'model.21': 'backbone.stage3.1.final_conv', + + # stage4 TinyDownSampleBlock + 'model.23': 'backbone.stage4.1.short_conv', + 'model.24': 'backbone.stage4.1.main_convs.0', + 'model.25': 'backbone.stage4.1.main_convs.1', + 'model.26': 
'backbone.stage4.1.main_convs.2', + 'model.28': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.29': 'neck.reduce_layers.2.short_layer', + 'model.30': 'neck.reduce_layers.2.main_layers', + 'model.35': 'neck.reduce_layers.2.fuse_layers', + 'model.37': 'neck.reduce_layers.2.final_conv', + 'model.38': 'neck.upsample_layers.0.0', + 'model.40': 'neck.reduce_layers.1', + 'model.42': 'neck.top_down_layers.0.short_conv', + 'model.43': 'neck.top_down_layers.0.main_convs.0', + 'model.44': 'neck.top_down_layers.0.main_convs.1', + 'model.45': 'neck.top_down_layers.0.main_convs.2', + 'model.47': 'neck.top_down_layers.0.final_conv', + 'model.48': 'neck.upsample_layers.1.0', + 'model.50': 'neck.reduce_layers.0', + 'model.52': 'neck.top_down_layers.1.short_conv', + 'model.53': 'neck.top_down_layers.1.main_convs.0', + 'model.54': 'neck.top_down_layers.1.main_convs.1', + 'model.55': 'neck.top_down_layers.1.main_convs.2', + 'model.57': 'neck.top_down_layers.1.final_conv', + 'model.58': 'neck.downsample_layers.0', + 'model.60': 'neck.bottom_up_layers.0.short_conv', + 'model.61': 'neck.bottom_up_layers.0.main_convs.0', + 'model.62': 'neck.bottom_up_layers.0.main_convs.1', + 'model.63': 'neck.bottom_up_layers.0.main_convs.2', + 'model.65': 'neck.bottom_up_layers.0.final_conv', + 'model.66': 'neck.downsample_layers.1', + 'model.68': 'neck.bottom_up_layers.1.short_conv', + 'model.69': 'neck.bottom_up_layers.1.main_convs.0', + 'model.70': 'neck.bottom_up_layers.1.main_convs.1', + 'model.71': 'neck.bottom_up_layers.1.main_convs.2', + 'model.73': 'neck.bottom_up_layers.1.final_conv', + 'model.74': 'neck.out_layers.0', + 'model.75': 'neck.out_layers.1', + 'model.76': 'neck.out_layers.2', + + # head + 'model.77.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.77.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.77.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_l = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + 'model.2': 
'backbone.stem.2', + + # stage1 + # ConvModule + 'model.3': 'backbone.stage1.0', + # ELANBlock expand_channel_2x + 'model.4': 'backbone.stage1.1.short_conv', + 'model.5': 'backbone.stage1.1.main_conv', + 'model.6': 'backbone.stage1.1.blocks.0.0', + 'model.7': 'backbone.stage1.1.blocks.0.1', + 'model.8': 'backbone.stage1.1.blocks.1.0', + 'model.9': 'backbone.stage1.1.blocks.1.1', + 'model.11': 'backbone.stage1.1.final_conv', + + # stage2 + # MaxPoolBlock reduce_channel_2x + 'model.13': 'backbone.stage2.0.maxpool_branches.1', + 'model.14': 'backbone.stage2.0.stride_conv_branches.0', + 'model.15': 'backbone.stage2.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.17': 'backbone.stage2.1.short_conv', + 'model.18': 'backbone.stage2.1.main_conv', + 'model.19': 'backbone.stage2.1.blocks.0.0', + 'model.20': 'backbone.stage2.1.blocks.0.1', + 'model.21': 'backbone.stage2.1.blocks.1.0', + 'model.22': 'backbone.stage2.1.blocks.1.1', + 'model.24': 'backbone.stage2.1.final_conv', + + # stage3 + # MaxPoolBlock reduce_channel_2x + 'model.26': 'backbone.stage3.0.maxpool_branches.1', + 'model.27': 'backbone.stage3.0.stride_conv_branches.0', + 'model.28': 'backbone.stage3.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.30': 'backbone.stage3.1.short_conv', + 'model.31': 'backbone.stage3.1.main_conv', + 'model.32': 'backbone.stage3.1.blocks.0.0', + 'model.33': 'backbone.stage3.1.blocks.0.1', + 'model.34': 'backbone.stage3.1.blocks.1.0', + 'model.35': 'backbone.stage3.1.blocks.1.1', + 'model.37': 'backbone.stage3.1.final_conv', + + # stage4 + # MaxPoolBlock reduce_channel_2x + 'model.39': 'backbone.stage4.0.maxpool_branches.1', + 'model.40': 'backbone.stage4.0.stride_conv_branches.0', + 'model.41': 'backbone.stage4.0.stride_conv_branches.1', + # ELANBlock no_change_channel + 'model.43': 'backbone.stage4.1.short_conv', + 'model.44': 'backbone.stage4.1.main_conv', + 'model.45': 'backbone.stage4.1.blocks.0.0', + 'model.46': 
'backbone.stage4.1.blocks.0.1', + 'model.47': 'backbone.stage4.1.blocks.1.0', + 'model.48': 'backbone.stage4.1.blocks.1.1', + 'model.50': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.51.cv1': 'neck.reduce_layers.2.main_layers.0', + 'model.51.cv3': 'neck.reduce_layers.2.main_layers.1', + 'model.51.cv4': 'neck.reduce_layers.2.main_layers.2', + 'model.51.cv5': 'neck.reduce_layers.2.fuse_layers.0', + 'model.51.cv6': 'neck.reduce_layers.2.fuse_layers.1', + 'model.51.cv2': 'neck.reduce_layers.2.short_layer', + 'model.51.cv7': 'neck.reduce_layers.2.final_conv', + + # neck + 'model.52': 'neck.upsample_layers.0.0', + 'model.54': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.56': 'neck.top_down_layers.0.short_conv', + 'model.57': 'neck.top_down_layers.0.main_conv', + 'model.58': 'neck.top_down_layers.0.blocks.0', + 'model.59': 'neck.top_down_layers.0.blocks.1', + 'model.60': 'neck.top_down_layers.0.blocks.2', + 'model.61': 'neck.top_down_layers.0.blocks.3', + 'model.63': 'neck.top_down_layers.0.final_conv', + 'model.64': 'neck.upsample_layers.1.0', + 'model.66': 'neck.reduce_layers.0', + + # neck ELANBlock reduce_channel_2x + 'model.68': 'neck.top_down_layers.1.short_conv', + 'model.69': 'neck.top_down_layers.1.main_conv', + 'model.70': 'neck.top_down_layers.1.blocks.0', + 'model.71': 'neck.top_down_layers.1.blocks.1', + 'model.72': 'neck.top_down_layers.1.blocks.2', + 'model.73': 'neck.top_down_layers.1.blocks.3', + 'model.75': 'neck.top_down_layers.1.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.77': 'neck.downsample_layers.0.maxpool_branches.1', + 'model.78': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.79': 'neck.downsample_layers.0.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.81': 'neck.bottom_up_layers.0.short_conv', + 'model.82': 'neck.bottom_up_layers.0.main_conv', + 'model.83': 'neck.bottom_up_layers.0.blocks.0', + 'model.84': 'neck.bottom_up_layers.0.blocks.1', 
+ 'model.85': 'neck.bottom_up_layers.0.blocks.2', + 'model.86': 'neck.bottom_up_layers.0.blocks.3', + 'model.88': 'neck.bottom_up_layers.0.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.90': 'neck.downsample_layers.1.maxpool_branches.1', + 'model.91': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.92': 'neck.downsample_layers.1.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.94': 'neck.bottom_up_layers.1.short_conv', + 'model.95': 'neck.bottom_up_layers.1.main_conv', + 'model.96': 'neck.bottom_up_layers.1.blocks.0', + 'model.97': 'neck.bottom_up_layers.1.blocks.1', + 'model.98': 'neck.bottom_up_layers.1.blocks.2', + 'model.99': 'neck.bottom_up_layers.1.blocks.3', + 'model.101': 'neck.bottom_up_layers.1.final_conv', + + # RepVGGBlock + 'model.102.rbr_dense.0': 'neck.out_layers.0.rbr_dense.conv', + 'model.102.rbr_dense.1': 'neck.out_layers.0.rbr_dense.bn', + 'model.102.rbr_1x1.0': 'neck.out_layers.0.rbr_1x1.conv', + 'model.102.rbr_1x1.1': 'neck.out_layers.0.rbr_1x1.bn', + 'model.103.rbr_dense.0': 'neck.out_layers.1.rbr_dense.conv', + 'model.103.rbr_dense.1': 'neck.out_layers.1.rbr_dense.bn', + 'model.103.rbr_1x1.0': 'neck.out_layers.1.rbr_1x1.conv', + 'model.103.rbr_1x1.1': 'neck.out_layers.1.rbr_1x1.bn', + 'model.104.rbr_dense.0': 'neck.out_layers.2.rbr_dense.conv', + 'model.104.rbr_dense.1': 'neck.out_layers.2.rbr_dense.bn', + 'model.104.rbr_1x1.0': 'neck.out_layers.2.rbr_1x1.conv', + 'model.104.rbr_1x1.1': 'neck.out_layers.2.rbr_1x1.bn', + + # head + 'model.105.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.105.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.105.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_x = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + 'model.2': 'backbone.stem.2', + + # stage1 + # ConvModule + 'model.3': 'backbone.stage1.0', + # ELANBlock expand_channel_2x + 'model.4': 'backbone.stage1.1.short_conv', + 'model.5': 
'backbone.stage1.1.main_conv', + 'model.6': 'backbone.stage1.1.blocks.0.0', + 'model.7': 'backbone.stage1.1.blocks.0.1', + 'model.8': 'backbone.stage1.1.blocks.1.0', + 'model.9': 'backbone.stage1.1.blocks.1.1', + 'model.10': 'backbone.stage1.1.blocks.2.0', + 'model.11': 'backbone.stage1.1.blocks.2.1', + 'model.13': 'backbone.stage1.1.final_conv', + + # stage2 + # MaxPoolBlock reduce_channel_2x + 'model.15': 'backbone.stage2.0.maxpool_branches.1', + 'model.16': 'backbone.stage2.0.stride_conv_branches.0', + 'model.17': 'backbone.stage2.0.stride_conv_branches.1', + + # ELANBlock expand_channel_2x + 'model.19': 'backbone.stage2.1.short_conv', + 'model.20': 'backbone.stage2.1.main_conv', + 'model.21': 'backbone.stage2.1.blocks.0.0', + 'model.22': 'backbone.stage2.1.blocks.0.1', + 'model.23': 'backbone.stage2.1.blocks.1.0', + 'model.24': 'backbone.stage2.1.blocks.1.1', + 'model.25': 'backbone.stage2.1.blocks.2.0', + 'model.26': 'backbone.stage2.1.blocks.2.1', + 'model.28': 'backbone.stage2.1.final_conv', + + # stage3 + # MaxPoolBlock reduce_channel_2x + 'model.30': 'backbone.stage3.0.maxpool_branches.1', + 'model.31': 'backbone.stage3.0.stride_conv_branches.0', + 'model.32': 'backbone.stage3.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.34': 'backbone.stage3.1.short_conv', + 'model.35': 'backbone.stage3.1.main_conv', + 'model.36': 'backbone.stage3.1.blocks.0.0', + 'model.37': 'backbone.stage3.1.blocks.0.1', + 'model.38': 'backbone.stage3.1.blocks.1.0', + 'model.39': 'backbone.stage3.1.blocks.1.1', + 'model.40': 'backbone.stage3.1.blocks.2.0', + 'model.41': 'backbone.stage3.1.blocks.2.1', + 'model.43': 'backbone.stage3.1.final_conv', + + # stage4 + # MaxPoolBlock reduce_channel_2x + 'model.45': 'backbone.stage4.0.maxpool_branches.1', + 'model.46': 'backbone.stage4.0.stride_conv_branches.0', + 'model.47': 'backbone.stage4.0.stride_conv_branches.1', + # ELANBlock no_change_channel + 'model.49': 'backbone.stage4.1.short_conv', + 'model.50': 
'backbone.stage4.1.main_conv', + 'model.51': 'backbone.stage4.1.blocks.0.0', + 'model.52': 'backbone.stage4.1.blocks.0.1', + 'model.53': 'backbone.stage4.1.blocks.1.0', + 'model.54': 'backbone.stage4.1.blocks.1.1', + 'model.55': 'backbone.stage4.1.blocks.2.0', + 'model.56': 'backbone.stage4.1.blocks.2.1', + 'model.58': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.59.cv1': 'neck.reduce_layers.2.main_layers.0', + 'model.59.cv3': 'neck.reduce_layers.2.main_layers.1', + 'model.59.cv4': 'neck.reduce_layers.2.main_layers.2', + 'model.59.cv5': 'neck.reduce_layers.2.fuse_layers.0', + 'model.59.cv6': 'neck.reduce_layers.2.fuse_layers.1', + 'model.59.cv2': 'neck.reduce_layers.2.short_layer', + 'model.59.cv7': 'neck.reduce_layers.2.final_conv', + + # neck + 'model.60': 'neck.upsample_layers.0.0', + 'model.62': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.64': 'neck.top_down_layers.0.short_conv', + 'model.65': 'neck.top_down_layers.0.main_conv', + 'model.66': 'neck.top_down_layers.0.blocks.0.0', + 'model.67': 'neck.top_down_layers.0.blocks.0.1', + 'model.68': 'neck.top_down_layers.0.blocks.1.0', + 'model.69': 'neck.top_down_layers.0.blocks.1.1', + 'model.70': 'neck.top_down_layers.0.blocks.2.0', + 'model.71': 'neck.top_down_layers.0.blocks.2.1', + 'model.73': 'neck.top_down_layers.0.final_conv', + 'model.74': 'neck.upsample_layers.1.0', + 'model.76': 'neck.reduce_layers.0', + + # neck ELANBlock reduce_channel_2x + 'model.78': 'neck.top_down_layers.1.short_conv', + 'model.79': 'neck.top_down_layers.1.main_conv', + 'model.80': 'neck.top_down_layers.1.blocks.0.0', + 'model.81': 'neck.top_down_layers.1.blocks.0.1', + 'model.82': 'neck.top_down_layers.1.blocks.1.0', + 'model.83': 'neck.top_down_layers.1.blocks.1.1', + 'model.84': 'neck.top_down_layers.1.blocks.2.0', + 'model.85': 'neck.top_down_layers.1.blocks.2.1', + 'model.87': 'neck.top_down_layers.1.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.89': 
'neck.downsample_layers.0.maxpool_branches.1', + 'model.90': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.91': 'neck.downsample_layers.0.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.93': 'neck.bottom_up_layers.0.short_conv', + 'model.94': 'neck.bottom_up_layers.0.main_conv', + 'model.95': 'neck.bottom_up_layers.0.blocks.0.0', + 'model.96': 'neck.bottom_up_layers.0.blocks.0.1', + 'model.97': 'neck.bottom_up_layers.0.blocks.1.0', + 'model.98': 'neck.bottom_up_layers.0.blocks.1.1', + 'model.99': 'neck.bottom_up_layers.0.blocks.2.0', + 'model.100': 'neck.bottom_up_layers.0.blocks.2.1', + 'model.102': 'neck.bottom_up_layers.0.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.104': 'neck.downsample_layers.1.maxpool_branches.1', + 'model.105': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.106': 'neck.downsample_layers.1.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.108': 'neck.bottom_up_layers.1.short_conv', + 'model.109': 'neck.bottom_up_layers.1.main_conv', + 'model.110': 'neck.bottom_up_layers.1.blocks.0.0', + 'model.111': 'neck.bottom_up_layers.1.blocks.0.1', + 'model.112': 'neck.bottom_up_layers.1.blocks.1.0', + 'model.113': 'neck.bottom_up_layers.1.blocks.1.1', + 'model.114': 'neck.bottom_up_layers.1.blocks.2.0', + 'model.115': 'neck.bottom_up_layers.1.blocks.2.1', + 'model.117': 'neck.bottom_up_layers.1.final_conv', + + # Conv + 'model.118': 'neck.out_layers.0', + 'model.119': 'neck.out_layers.1', + 'model.120': 'neck.out_layers.2', + + # head + 'model.121.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.121.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.121.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_w = { + # stem + 'model.1': 'backbone.stem.conv', + + # stage1 + # ConvModule + 'model.2': 'backbone.stage1.0', + # ELANBlock + 'model.3': 'backbone.stage1.1.short_conv', + 'model.4': 'backbone.stage1.1.main_conv', + 'model.5': 
'backbone.stage1.1.blocks.0.0', + 'model.6': 'backbone.stage1.1.blocks.0.1', + 'model.7': 'backbone.stage1.1.blocks.1.0', + 'model.8': 'backbone.stage1.1.blocks.1.1', + 'model.10': 'backbone.stage1.1.final_conv', + + # stage2 + 'model.11': 'backbone.stage2.0', + # ELANBlock + 'model.12': 'backbone.stage2.1.short_conv', + 'model.13': 'backbone.stage2.1.main_conv', + 'model.14': 'backbone.stage2.1.blocks.0.0', + 'model.15': 'backbone.stage2.1.blocks.0.1', + 'model.16': 'backbone.stage2.1.blocks.1.0', + 'model.17': 'backbone.stage2.1.blocks.1.1', + 'model.19': 'backbone.stage2.1.final_conv', + + # stage3 + 'model.20': 'backbone.stage3.0', + # ELANBlock + 'model.21': 'backbone.stage3.1.short_conv', + 'model.22': 'backbone.stage3.1.main_conv', + 'model.23': 'backbone.stage3.1.blocks.0.0', + 'model.24': 'backbone.stage3.1.blocks.0.1', + 'model.25': 'backbone.stage3.1.blocks.1.0', + 'model.26': 'backbone.stage3.1.blocks.1.1', + 'model.28': 'backbone.stage3.1.final_conv', + + # stage4 + 'model.29': 'backbone.stage4.0', + # ELANBlock + 'model.30': 'backbone.stage4.1.short_conv', + 'model.31': 'backbone.stage4.1.main_conv', + 'model.32': 'backbone.stage4.1.blocks.0.0', + 'model.33': 'backbone.stage4.1.blocks.0.1', + 'model.34': 'backbone.stage4.1.blocks.1.0', + 'model.35': 'backbone.stage4.1.blocks.1.1', + 'model.37': 'backbone.stage4.1.final_conv', + + # stage5 + 'model.38': 'backbone.stage5.0', + # ELANBlock + 'model.39': 'backbone.stage5.1.short_conv', + 'model.40': 'backbone.stage5.1.main_conv', + 'model.41': 'backbone.stage5.1.blocks.0.0', + 'model.42': 'backbone.stage5.1.blocks.0.1', + 'model.43': 'backbone.stage5.1.blocks.1.0', + 'model.44': 'backbone.stage5.1.blocks.1.1', + 'model.46': 'backbone.stage5.1.final_conv', + + # neck SPPCSPBlock + 'model.47.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.47.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.47.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.47.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 
'model.47.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.47.cv2': 'neck.reduce_layers.3.short_layer', + 'model.47.cv7': 'neck.reduce_layers.3.final_conv', + + # neck + 'model.48': 'neck.upsample_layers.0.0', + 'model.50': 'neck.reduce_layers.2', + + # neck ELANBlock + 'model.52': 'neck.top_down_layers.0.short_conv', + 'model.53': 'neck.top_down_layers.0.main_conv', + 'model.54': 'neck.top_down_layers.0.blocks.0', + 'model.55': 'neck.top_down_layers.0.blocks.1', + 'model.56': 'neck.top_down_layers.0.blocks.2', + 'model.57': 'neck.top_down_layers.0.blocks.3', + 'model.59': 'neck.top_down_layers.0.final_conv', + 'model.60': 'neck.upsample_layers.1.0', + 'model.62': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.64': 'neck.top_down_layers.1.short_conv', + 'model.65': 'neck.top_down_layers.1.main_conv', + 'model.66': 'neck.top_down_layers.1.blocks.0', + 'model.67': 'neck.top_down_layers.1.blocks.1', + 'model.68': 'neck.top_down_layers.1.blocks.2', + 'model.69': 'neck.top_down_layers.1.blocks.3', + 'model.71': 'neck.top_down_layers.1.final_conv', + 'model.72': 'neck.upsample_layers.2.0', + 'model.74': 'neck.reduce_layers.0', + 'model.76': 'neck.top_down_layers.2.short_conv', + 'model.77': 'neck.top_down_layers.2.main_conv', + 'model.78': 'neck.top_down_layers.2.blocks.0', + 'model.79': 'neck.top_down_layers.2.blocks.1', + 'model.80': 'neck.top_down_layers.2.blocks.2', + 'model.81': 'neck.top_down_layers.2.blocks.3', + 'model.83': 'neck.top_down_layers.2.final_conv', + 'model.84': 'neck.downsample_layers.0', + + # neck ELANBlock + 'model.86': 'neck.bottom_up_layers.0.short_conv', + 'model.87': 'neck.bottom_up_layers.0.main_conv', + 'model.88': 'neck.bottom_up_layers.0.blocks.0', + 'model.89': 'neck.bottom_up_layers.0.blocks.1', + 'model.90': 'neck.bottom_up_layers.0.blocks.2', + 'model.91': 'neck.bottom_up_layers.0.blocks.3', + 'model.93': 'neck.bottom_up_layers.0.final_conv', + 'model.94': 'neck.downsample_layers.1', + + # neck ELANBlock 
reduce_channel_2x + 'model.96': 'neck.bottom_up_layers.1.short_conv', + 'model.97': 'neck.bottom_up_layers.1.main_conv', + 'model.98': 'neck.bottom_up_layers.1.blocks.0', + 'model.99': 'neck.bottom_up_layers.1.blocks.1', + 'model.100': 'neck.bottom_up_layers.1.blocks.2', + 'model.101': 'neck.bottom_up_layers.1.blocks.3', + 'model.103': 'neck.bottom_up_layers.1.final_conv', + 'model.104': 'neck.downsample_layers.2', + + # neck ELANBlock reduce_channel_2x + 'model.106': 'neck.bottom_up_layers.2.short_conv', + 'model.107': 'neck.bottom_up_layers.2.main_conv', + 'model.108': 'neck.bottom_up_layers.2.blocks.0', + 'model.109': 'neck.bottom_up_layers.2.blocks.1', + 'model.110': 'neck.bottom_up_layers.2.blocks.2', + 'model.111': 'neck.bottom_up_layers.2.blocks.3', + 'model.113': 'neck.bottom_up_layers.2.final_conv', + 'model.114': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.115': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.116': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.117': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.118.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.118.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.118.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.118.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dict_e = { + # stem + 'model.1': 'backbone.stem.conv', + + # stage1 + 'model.2.cv1': 'backbone.stage1.0.stride_conv_branches.0', + 'model.2.cv2': 'backbone.stage1.0.stride_conv_branches.1', + 'model.2.cv3': 'backbone.stage1.0.maxpool_branches.1', + + # ELANBlock + 'model.3': 'backbone.stage1.1.short_conv', + 'model.4': 'backbone.stage1.1.main_conv', + 'model.5': 'backbone.stage1.1.blocks.0.0', + 'model.6': 'backbone.stage1.1.blocks.0.1', + 'model.7': 'backbone.stage1.1.blocks.1.0', + 'model.8': 'backbone.stage1.1.blocks.1.1', + 'model.9': 'backbone.stage1.1.blocks.2.0', + 'model.10': 'backbone.stage1.1.blocks.2.1', + 'model.12': 
'backbone.stage1.1.final_conv', + + # stage2 + 'model.13.cv1': 'backbone.stage2.0.stride_conv_branches.0', + 'model.13.cv2': 'backbone.stage2.0.stride_conv_branches.1', + 'model.13.cv3': 'backbone.stage2.0.maxpool_branches.1', + + # ELANBlock + 'model.14': 'backbone.stage2.1.short_conv', + 'model.15': 'backbone.stage2.1.main_conv', + 'model.16': 'backbone.stage2.1.blocks.0.0', + 'model.17': 'backbone.stage2.1.blocks.0.1', + 'model.18': 'backbone.stage2.1.blocks.1.0', + 'model.19': 'backbone.stage2.1.blocks.1.1', + 'model.20': 'backbone.stage2.1.blocks.2.0', + 'model.21': 'backbone.stage2.1.blocks.2.1', + 'model.23': 'backbone.stage2.1.final_conv', + + # stage3 + 'model.24.cv1': 'backbone.stage3.0.stride_conv_branches.0', + 'model.24.cv2': 'backbone.stage3.0.stride_conv_branches.1', + 'model.24.cv3': 'backbone.stage3.0.maxpool_branches.1', + + # ELANBlock + 'model.25': 'backbone.stage3.1.short_conv', + 'model.26': 'backbone.stage3.1.main_conv', + 'model.27': 'backbone.stage3.1.blocks.0.0', + 'model.28': 'backbone.stage3.1.blocks.0.1', + 'model.29': 'backbone.stage3.1.blocks.1.0', + 'model.30': 'backbone.stage3.1.blocks.1.1', + 'model.31': 'backbone.stage3.1.blocks.2.0', + 'model.32': 'backbone.stage3.1.blocks.2.1', + 'model.34': 'backbone.stage3.1.final_conv', + + # stage4 + 'model.35.cv1': 'backbone.stage4.0.stride_conv_branches.0', + 'model.35.cv2': 'backbone.stage4.0.stride_conv_branches.1', + 'model.35.cv3': 'backbone.stage4.0.maxpool_branches.1', + + # ELANBlock + 'model.36': 'backbone.stage4.1.short_conv', + 'model.37': 'backbone.stage4.1.main_conv', + 'model.38': 'backbone.stage4.1.blocks.0.0', + 'model.39': 'backbone.stage4.1.blocks.0.1', + 'model.40': 'backbone.stage4.1.blocks.1.0', + 'model.41': 'backbone.stage4.1.blocks.1.1', + 'model.42': 'backbone.stage4.1.blocks.2.0', + 'model.43': 'backbone.stage4.1.blocks.2.1', + 'model.45': 'backbone.stage4.1.final_conv', + + # stage5 + 'model.46.cv1': 'backbone.stage5.0.stride_conv_branches.0', + 'model.46.cv2': 
'backbone.stage5.0.stride_conv_branches.1', + 'model.46.cv3': 'backbone.stage5.0.maxpool_branches.1', + + # ELANBlock + 'model.47': 'backbone.stage5.1.short_conv', + 'model.48': 'backbone.stage5.1.main_conv', + 'model.49': 'backbone.stage5.1.blocks.0.0', + 'model.50': 'backbone.stage5.1.blocks.0.1', + 'model.51': 'backbone.stage5.1.blocks.1.0', + 'model.52': 'backbone.stage5.1.blocks.1.1', + 'model.53': 'backbone.stage5.1.blocks.2.0', + 'model.54': 'backbone.stage5.1.blocks.2.1', + 'model.56': 'backbone.stage5.1.final_conv', + + # neck SPPCSPBlock + 'model.57.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.57.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.57.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.57.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 'model.57.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.57.cv2': 'neck.reduce_layers.3.short_layer', + 'model.57.cv7': 'neck.reduce_layers.3.final_conv', + + # neck + 'model.58': 'neck.upsample_layers.0.0', + 'model.60': 'neck.reduce_layers.2', + + # neck ELANBlock + 'model.62': 'neck.top_down_layers.0.short_conv', + 'model.63': 'neck.top_down_layers.0.main_conv', + 'model.64': 'neck.top_down_layers.0.blocks.0', + 'model.65': 'neck.top_down_layers.0.blocks.1', + 'model.66': 'neck.top_down_layers.0.blocks.2', + 'model.67': 'neck.top_down_layers.0.blocks.3', + 'model.68': 'neck.top_down_layers.0.blocks.4', + 'model.69': 'neck.top_down_layers.0.blocks.5', + 'model.71': 'neck.top_down_layers.0.final_conv', + 'model.72': 'neck.upsample_layers.1.0', + 'model.74': 'neck.reduce_layers.1', + + # neck ELANBlock + 'model.76': 'neck.top_down_layers.1.short_conv', + 'model.77': 'neck.top_down_layers.1.main_conv', + 'model.78': 'neck.top_down_layers.1.blocks.0', + 'model.79': 'neck.top_down_layers.1.blocks.1', + 'model.80': 'neck.top_down_layers.1.blocks.2', + 'model.81': 'neck.top_down_layers.1.blocks.3', + 'model.82': 'neck.top_down_layers.1.blocks.4', + 'model.83': 'neck.top_down_layers.1.blocks.5', + 
'model.85': 'neck.top_down_layers.1.final_conv', + 'model.86': 'neck.upsample_layers.2.0', + 'model.88': 'neck.reduce_layers.0', + 'model.90': 'neck.top_down_layers.2.short_conv', + 'model.91': 'neck.top_down_layers.2.main_conv', + 'model.92': 'neck.top_down_layers.2.blocks.0', + 'model.93': 'neck.top_down_layers.2.blocks.1', + 'model.94': 'neck.top_down_layers.2.blocks.2', + 'model.95': 'neck.top_down_layers.2.blocks.3', + 'model.96': 'neck.top_down_layers.2.blocks.4', + 'model.97': 'neck.top_down_layers.2.blocks.5', + 'model.99': 'neck.top_down_layers.2.final_conv', + 'model.100.cv1': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.100.cv2': 'neck.downsample_layers.0.stride_conv_branches.1', + 'model.100.cv3': 'neck.downsample_layers.0.maxpool_branches.1', + + # neck ELANBlock + 'model.102': 'neck.bottom_up_layers.0.short_conv', + 'model.103': 'neck.bottom_up_layers.0.main_conv', + 'model.104': 'neck.bottom_up_layers.0.blocks.0', + 'model.105': 'neck.bottom_up_layers.0.blocks.1', + 'model.106': 'neck.bottom_up_layers.0.blocks.2', + 'model.107': 'neck.bottom_up_layers.0.blocks.3', + 'model.108': 'neck.bottom_up_layers.0.blocks.4', + 'model.109': 'neck.bottom_up_layers.0.blocks.5', + 'model.111': 'neck.bottom_up_layers.0.final_conv', + 'model.112.cv1': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.112.cv2': 'neck.downsample_layers.1.stride_conv_branches.1', + 'model.112.cv3': 'neck.downsample_layers.1.maxpool_branches.1', + + # neck ELANBlock + 'model.114': 'neck.bottom_up_layers.1.short_conv', + 'model.115': 'neck.bottom_up_layers.1.main_conv', + 'model.116': 'neck.bottom_up_layers.1.blocks.0', + 'model.117': 'neck.bottom_up_layers.1.blocks.1', + 'model.118': 'neck.bottom_up_layers.1.blocks.2', + 'model.119': 'neck.bottom_up_layers.1.blocks.3', + 'model.120': 'neck.bottom_up_layers.1.blocks.4', + 'model.121': 'neck.bottom_up_layers.1.blocks.5', + 'model.123': 'neck.bottom_up_layers.1.final_conv', + 'model.124.cv1': 
'neck.downsample_layers.2.stride_conv_branches.0', + 'model.124.cv2': 'neck.downsample_layers.2.stride_conv_branches.1', + 'model.124.cv3': 'neck.downsample_layers.2.maxpool_branches.1', + + # neck ELANBlock + 'model.126': 'neck.bottom_up_layers.2.short_conv', + 'model.127': 'neck.bottom_up_layers.2.main_conv', + 'model.128': 'neck.bottom_up_layers.2.blocks.0', + 'model.129': 'neck.bottom_up_layers.2.blocks.1', + 'model.130': 'neck.bottom_up_layers.2.blocks.2', + 'model.131': 'neck.bottom_up_layers.2.blocks.3', + 'model.132': 'neck.bottom_up_layers.2.blocks.4', + 'model.133': 'neck.bottom_up_layers.2.blocks.5', + 'model.135': 'neck.bottom_up_layers.2.final_conv', + 'model.136': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.137': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.138': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.139': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.140.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.140.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.140.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.140.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dict_e2e = { + # stem + 'model.1': 'backbone.stem.conv', + + # stage1 + 'model.2.cv1': 'backbone.stage1.0.stride_conv_branches.0', + 'model.2.cv2': 'backbone.stage1.0.stride_conv_branches.1', + 'model.2.cv3': 'backbone.stage1.0.maxpool_branches.1', + + # E-ELANBlock + 'model.3': 'backbone.stage1.1.e_elan_blocks.0.short_conv', + 'model.4': 'backbone.stage1.1.e_elan_blocks.0.main_conv', + 'model.5': 'backbone.stage1.1.e_elan_blocks.0.blocks.0.0', + 'model.6': 'backbone.stage1.1.e_elan_blocks.0.blocks.0.1', + 'model.7': 'backbone.stage1.1.e_elan_blocks.0.blocks.1.0', + 'model.8': 'backbone.stage1.1.e_elan_blocks.0.blocks.1.1', + 'model.9': 'backbone.stage1.1.e_elan_blocks.0.blocks.2.0', + 'model.10': 'backbone.stage1.1.e_elan_blocks.0.blocks.2.1', + 'model.12': 
'backbone.stage1.1.e_elan_blocks.0.final_conv', + 'model.13': 'backbone.stage1.1.e_elan_blocks.1.short_conv', + 'model.14': 'backbone.stage1.1.e_elan_blocks.1.main_conv', + 'model.15': 'backbone.stage1.1.e_elan_blocks.1.blocks.0.0', + 'model.16': 'backbone.stage1.1.e_elan_blocks.1.blocks.0.1', + 'model.17': 'backbone.stage1.1.e_elan_blocks.1.blocks.1.0', + 'model.18': 'backbone.stage1.1.e_elan_blocks.1.blocks.1.1', + 'model.19': 'backbone.stage1.1.e_elan_blocks.1.blocks.2.0', + 'model.20': 'backbone.stage1.1.e_elan_blocks.1.blocks.2.1', + 'model.22': 'backbone.stage1.1.e_elan_blocks.1.final_conv', + + # stage2 + 'model.24.cv1': 'backbone.stage2.0.stride_conv_branches.0', + 'model.24.cv2': 'backbone.stage2.0.stride_conv_branches.1', + 'model.24.cv3': 'backbone.stage2.0.maxpool_branches.1', + + # E-ELANBlock + 'model.25': 'backbone.stage2.1.e_elan_blocks.0.short_conv', + 'model.26': 'backbone.stage2.1.e_elan_blocks.0.main_conv', + 'model.27': 'backbone.stage2.1.e_elan_blocks.0.blocks.0.0', + 'model.28': 'backbone.stage2.1.e_elan_blocks.0.blocks.0.1', + 'model.29': 'backbone.stage2.1.e_elan_blocks.0.blocks.1.0', + 'model.30': 'backbone.stage2.1.e_elan_blocks.0.blocks.1.1', + 'model.31': 'backbone.stage2.1.e_elan_blocks.0.blocks.2.0', + 'model.32': 'backbone.stage2.1.e_elan_blocks.0.blocks.2.1', + 'model.34': 'backbone.stage2.1.e_elan_blocks.0.final_conv', + 'model.35': 'backbone.stage2.1.e_elan_blocks.1.short_conv', + 'model.36': 'backbone.stage2.1.e_elan_blocks.1.main_conv', + 'model.37': 'backbone.stage2.1.e_elan_blocks.1.blocks.0.0', + 'model.38': 'backbone.stage2.1.e_elan_blocks.1.blocks.0.1', + 'model.39': 'backbone.stage2.1.e_elan_blocks.1.blocks.1.0', + 'model.40': 'backbone.stage2.1.e_elan_blocks.1.blocks.1.1', + 'model.41': 'backbone.stage2.1.e_elan_blocks.1.blocks.2.0', + 'model.42': 'backbone.stage2.1.e_elan_blocks.1.blocks.2.1', + 'model.44': 'backbone.stage2.1.e_elan_blocks.1.final_conv', + + # stage3 + 'model.46.cv1': 
'backbone.stage3.0.stride_conv_branches.0', + 'model.46.cv2': 'backbone.stage3.0.stride_conv_branches.1', + 'model.46.cv3': 'backbone.stage3.0.maxpool_branches.1', + + # E-ELANBlock + 'model.47': 'backbone.stage3.1.e_elan_blocks.0.short_conv', + 'model.48': 'backbone.stage3.1.e_elan_blocks.0.main_conv', + 'model.49': 'backbone.stage3.1.e_elan_blocks.0.blocks.0.0', + 'model.50': 'backbone.stage3.1.e_elan_blocks.0.blocks.0.1', + 'model.51': 'backbone.stage3.1.e_elan_blocks.0.blocks.1.0', + 'model.52': 'backbone.stage3.1.e_elan_blocks.0.blocks.1.1', + 'model.53': 'backbone.stage3.1.e_elan_blocks.0.blocks.2.0', + 'model.54': 'backbone.stage3.1.e_elan_blocks.0.blocks.2.1', + 'model.56': 'backbone.stage3.1.e_elan_blocks.0.final_conv', + 'model.57': 'backbone.stage3.1.e_elan_blocks.1.short_conv', + 'model.58': 'backbone.stage3.1.e_elan_blocks.1.main_conv', + 'model.59': 'backbone.stage3.1.e_elan_blocks.1.blocks.0.0', + 'model.60': 'backbone.stage3.1.e_elan_blocks.1.blocks.0.1', + 'model.61': 'backbone.stage3.1.e_elan_blocks.1.blocks.1.0', + 'model.62': 'backbone.stage3.1.e_elan_blocks.1.blocks.1.1', + 'model.63': 'backbone.stage3.1.e_elan_blocks.1.blocks.2.0', + 'model.64': 'backbone.stage3.1.e_elan_blocks.1.blocks.2.1', + 'model.66': 'backbone.stage3.1.e_elan_blocks.1.final_conv', + + # stage4 + 'model.68.cv1': 'backbone.stage4.0.stride_conv_branches.0', + 'model.68.cv2': 'backbone.stage4.0.stride_conv_branches.1', + 'model.68.cv3': 'backbone.stage4.0.maxpool_branches.1', + + # E-ELANBlock + 'model.69': 'backbone.stage4.1.e_elan_blocks.0.short_conv', + 'model.70': 'backbone.stage4.1.e_elan_blocks.0.main_conv', + 'model.71': 'backbone.stage4.1.e_elan_blocks.0.blocks.0.0', + 'model.72': 'backbone.stage4.1.e_elan_blocks.0.blocks.0.1', + 'model.73': 'backbone.stage4.1.e_elan_blocks.0.blocks.1.0', + 'model.74': 'backbone.stage4.1.e_elan_blocks.0.blocks.1.1', + 'model.75': 'backbone.stage4.1.e_elan_blocks.0.blocks.2.0', + 'model.76': 
'backbone.stage4.1.e_elan_blocks.0.blocks.2.1', + 'model.78': 'backbone.stage4.1.e_elan_blocks.0.final_conv', + 'model.79': 'backbone.stage4.1.e_elan_blocks.1.short_conv', + 'model.80': 'backbone.stage4.1.e_elan_blocks.1.main_conv', + 'model.81': 'backbone.stage4.1.e_elan_blocks.1.blocks.0.0', + 'model.82': 'backbone.stage4.1.e_elan_blocks.1.blocks.0.1', + 'model.83': 'backbone.stage4.1.e_elan_blocks.1.blocks.1.0', + 'model.84': 'backbone.stage4.1.e_elan_blocks.1.blocks.1.1', + 'model.85': 'backbone.stage4.1.e_elan_blocks.1.blocks.2.0', + 'model.86': 'backbone.stage4.1.e_elan_blocks.1.blocks.2.1', + 'model.88': 'backbone.stage4.1.e_elan_blocks.1.final_conv', + + # stage5 + 'model.90.cv1': 'backbone.stage5.0.stride_conv_branches.0', + 'model.90.cv2': 'backbone.stage5.0.stride_conv_branches.1', + 'model.90.cv3': 'backbone.stage5.0.maxpool_branches.1', + + # E-ELANBlock + 'model.91': 'backbone.stage5.1.e_elan_blocks.0.short_conv', + 'model.92': 'backbone.stage5.1.e_elan_blocks.0.main_conv', + 'model.93': 'backbone.stage5.1.e_elan_blocks.0.blocks.0.0', + 'model.94': 'backbone.stage5.1.e_elan_blocks.0.blocks.0.1', + 'model.95': 'backbone.stage5.1.e_elan_blocks.0.blocks.1.0', + 'model.96': 'backbone.stage5.1.e_elan_blocks.0.blocks.1.1', + 'model.97': 'backbone.stage5.1.e_elan_blocks.0.blocks.2.0', + 'model.98': 'backbone.stage5.1.e_elan_blocks.0.blocks.2.1', + 'model.100': 'backbone.stage5.1.e_elan_blocks.0.final_conv', + 'model.101': 'backbone.stage5.1.e_elan_blocks.1.short_conv', + 'model.102': 'backbone.stage5.1.e_elan_blocks.1.main_conv', + 'model.103': 'backbone.stage5.1.e_elan_blocks.1.blocks.0.0', + 'model.104': 'backbone.stage5.1.e_elan_blocks.1.blocks.0.1', + 'model.105': 'backbone.stage5.1.e_elan_blocks.1.blocks.1.0', + 'model.106': 'backbone.stage5.1.e_elan_blocks.1.blocks.1.1', + 'model.107': 'backbone.stage5.1.e_elan_blocks.1.blocks.2.0', + 'model.108': 'backbone.stage5.1.e_elan_blocks.1.blocks.2.1', + 'model.110': 
'backbone.stage5.1.e_elan_blocks.1.final_conv', + + # neck SPPCSPBlock + 'model.112.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.112.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.112.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.112.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 'model.112.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.112.cv2': 'neck.reduce_layers.3.short_layer', + 'model.112.cv7': 'neck.reduce_layers.3.final_conv', + + # neck + 'model.113': 'neck.upsample_layers.0.0', + 'model.115': 'neck.reduce_layers.2', + + # neck E-ELANBlock + 'model.117': 'neck.top_down_layers.0.e_elan_blocks.0.short_conv', + 'model.118': 'neck.top_down_layers.0.e_elan_blocks.0.main_conv', + 'model.119': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.0', + 'model.120': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.1', + 'model.121': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.2', + 'model.122': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.3', + 'model.123': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.4', + 'model.124': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.5', + 'model.126': 'neck.top_down_layers.0.e_elan_blocks.0.final_conv', + 'model.127': 'neck.top_down_layers.0.e_elan_blocks.1.short_conv', + 'model.128': 'neck.top_down_layers.0.e_elan_blocks.1.main_conv', + 'model.129': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.0', + 'model.130': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.1', + 'model.131': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.2', + 'model.132': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.3', + 'model.133': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.4', + 'model.134': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.5', + 'model.136': 'neck.top_down_layers.0.e_elan_blocks.1.final_conv', + 'model.138': 'neck.upsample_layers.1.0', + 'model.140': 'neck.reduce_layers.1', + + # neck E-ELANBlock + 'model.142': 'neck.top_down_layers.1.e_elan_blocks.0.short_conv', + 'model.143': 
'neck.top_down_layers.1.e_elan_blocks.0.main_conv', + 'model.144': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.0', + 'model.145': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.1', + 'model.146': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.2', + 'model.147': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.3', + 'model.148': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.4', + 'model.149': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.5', + 'model.151': 'neck.top_down_layers.1.e_elan_blocks.0.final_conv', + 'model.152': 'neck.top_down_layers.1.e_elan_blocks.1.short_conv', + 'model.153': 'neck.top_down_layers.1.e_elan_blocks.1.main_conv', + 'model.154': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.0', + 'model.155': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.1', + 'model.156': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.2', + 'model.157': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.3', + 'model.158': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.4', + 'model.159': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.5', + 'model.161': 'neck.top_down_layers.1.e_elan_blocks.1.final_conv', + 'model.163': 'neck.upsample_layers.2.0', + 'model.165': 'neck.reduce_layers.0', + 'model.167': 'neck.top_down_layers.2.e_elan_blocks.0.short_conv', + 'model.168': 'neck.top_down_layers.2.e_elan_blocks.0.main_conv', + 'model.169': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.0', + 'model.170': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.1', + 'model.171': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.2', + 'model.172': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.3', + 'model.173': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.4', + 'model.174': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.5', + 'model.176': 'neck.top_down_layers.2.e_elan_blocks.0.final_conv', + 'model.177': 'neck.top_down_layers.2.e_elan_blocks.1.short_conv', + 'model.178': 'neck.top_down_layers.2.e_elan_blocks.1.main_conv', + 'model.179': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.0', + 
'model.180': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.1', + 'model.181': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.2', + 'model.182': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.3', + 'model.183': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.4', + 'model.184': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.5', + 'model.186': 'neck.top_down_layers.2.e_elan_blocks.1.final_conv', + 'model.188.cv1': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.188.cv2': 'neck.downsample_layers.0.stride_conv_branches.1', + 'model.188.cv3': 'neck.downsample_layers.0.maxpool_branches.1', + + # neck E-ELANBlock + 'model.190': 'neck.bottom_up_layers.0.e_elan_blocks.0.short_conv', + 'model.191': 'neck.bottom_up_layers.0.e_elan_blocks.0.main_conv', + 'model.192': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.0', + 'model.193': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.1', + 'model.194': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.2', + 'model.195': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.3', + 'model.196': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.4', + 'model.197': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.5', + 'model.199': 'neck.bottom_up_layers.0.e_elan_blocks.0.final_conv', + 'model.200': 'neck.bottom_up_layers.0.e_elan_blocks.1.short_conv', + 'model.201': 'neck.bottom_up_layers.0.e_elan_blocks.1.main_conv', + 'model.202': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.0', + 'model.203': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.1', + 'model.204': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.2', + 'model.205': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.3', + 'model.206': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.4', + 'model.207': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.5', + 'model.209': 'neck.bottom_up_layers.0.e_elan_blocks.1.final_conv', + 'model.211.cv1': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.211.cv2': 'neck.downsample_layers.1.stride_conv_branches.1', + 'model.211.cv3': 
'neck.downsample_layers.1.maxpool_branches.1', + 'model.213': 'neck.bottom_up_layers.1.e_elan_blocks.0.short_conv', + 'model.214': 'neck.bottom_up_layers.1.e_elan_blocks.0.main_conv', + 'model.215': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.0', + 'model.216': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.1', + 'model.217': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.2', + 'model.218': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.3', + 'model.219': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.4', + 'model.220': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.5', + 'model.222': 'neck.bottom_up_layers.1.e_elan_blocks.0.final_conv', + 'model.223': 'neck.bottom_up_layers.1.e_elan_blocks.1.short_conv', + 'model.224': 'neck.bottom_up_layers.1.e_elan_blocks.1.main_conv', + 'model.225': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.0', + 'model.226': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.1', + 'model.227': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.2', + 'model.228': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.3', + 'model.229': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.4', + 'model.230': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.5', + 'model.232': 'neck.bottom_up_layers.1.e_elan_blocks.1.final_conv', + 'model.234.cv1': 'neck.downsample_layers.2.stride_conv_branches.0', + 'model.234.cv2': 'neck.downsample_layers.2.stride_conv_branches.1', + 'model.234.cv3': 'neck.downsample_layers.2.maxpool_branches.1', + + # neck E-ELANBlock + 'model.236': 'neck.bottom_up_layers.2.e_elan_blocks.0.short_conv', + 'model.237': 'neck.bottom_up_layers.2.e_elan_blocks.0.main_conv', + 'model.238': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.0', + 'model.239': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.1', + 'model.240': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.2', + 'model.241': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.3', + 'model.242': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.4', + 'model.243': 
'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.5', + 'model.245': 'neck.bottom_up_layers.2.e_elan_blocks.0.final_conv', + 'model.246': 'neck.bottom_up_layers.2.e_elan_blocks.1.short_conv', + 'model.247': 'neck.bottom_up_layers.2.e_elan_blocks.1.main_conv', + 'model.248': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.0', + 'model.249': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.1', + 'model.250': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.2', + 'model.251': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.3', + 'model.252': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.4', + 'model.253': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.5', + 'model.255': 'neck.bottom_up_layers.2.e_elan_blocks.1.final_conv', + 'model.257': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.258': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.259': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.260': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.261.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.261.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.261.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.261.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dicts = { + 'yolov7-tiny.pt': convert_dict_tiny, + 'yolov7-w6.pt': convert_dict_w, + 'yolov7-e6.pt': convert_dict_e, + 'yolov7-e6e.pt': convert_dict_e2e, + 'yolov7.pt': convert_dict_l, + 'yolov7x.pt': convert_dict_x +} + + +def convert(src, dst): + src_key = osp.basename(src) + convert_dict = convert_dicts[osp.basename(src)] + + num_levels = 3 + if src_key == 'yolov7.pt': + indexes = [102, 51] + in_channels = [256, 512, 1024] + elif src_key == 'yolov7x.pt': + indexes = [121, 59] + in_channels = [320, 640, 1280] + elif src_key == 'yolov7-tiny.pt': + indexes = [77, 1000] + in_channels = [128, 256, 512] + elif src_key == 'yolov7-w6.pt': + indexes = [118, 47] + in_channels = [256, 512, 768, 1024] + num_levels = 4 + elif src_key == 
'yolov7-e6.pt': + indexes = [140, [2, 13, 24, 35, 46, 57, 100, 112, 124]] + in_channels = 320, 640, 960, 1280 + num_levels = 4 + elif src_key == 'yolov7-e6e.pt': + indexes = [261, [2, 24, 46, 68, 90, 112, 188, 211, 234]] + in_channels = 320, 640, 960, 1280 + num_levels = 4 + + if isinstance(indexes[1], int): + indexes[1] = [indexes[1]] + """Convert keys in detectron pretrained YOLOv7 models to mmyolo style.""" + try: + yolov7_model = torch.load(src)['model'].float() + blobs = yolov7_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the WongKinYiu/yolov7 repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + if key.find('anchors') >= 0 or key.find('anchor_grid') >= 0: + continue + + num, module = key.split('.')[1:3] + if int(num) < indexes[0] and int(num) not in indexes[1]: + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + elif int(num) in indexes[1]: + strs_key = key.split('.')[:3] + new_key = key.replace('.'.join(strs_key), + convert_dict['.'.join(strs_key)]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + else: + strs_key = key.split('.')[:4] + new_key = key.replace('.'.join(strs_key), + convert_dict['.'.join(strs_key)]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # Add ImplicitA and ImplicitM + for i in range(num_levels): + if num_levels == 3: + implicit_a = f'bbox_head.head_module.' \ + f'convs_pred.{i}.0.implicit' + state_dict[implicit_a] = torch.zeros((1, in_channels[i], 1, 1)) + implicit_m = f'bbox_head.head_module.' \ + f'convs_pred.{i}.2.implicit' + state_dict[implicit_m] = torch.ones((1, 3 * 85, 1, 1)) + else: + implicit_a = f'bbox_head.head_module.' 
\ + f'main_convs_pred.{i}.1.implicit' + state_dict[implicit_a] = torch.zeros((1, in_channels[i], 1, 1)) + implicit_m = f'bbox_head.head_module.' \ + f'main_convs_pred.{i}.3.implicit' + state_dict[implicit_m] = torch.ones((1, 3 * 85, 1, 1)) + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the yolov7 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + 'src', default='yolov7.pt', help='src yolov7 model path') + parser.add_argument('dst', default='mm_yolov7l.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + print('If your model weights are from P6 models, such as W6, E6, D6, \ + E6E, the auxiliary training module is not required to be loaded, \ + so it is normal for the weights of the auxiliary module \ + to be missing.') + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..4ed64f2492ba0bece874c482fe704492fad4e8e9 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +from collections import OrderedDict + +import torch + +convert_dict_s = { + # backbone + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9': 'backbone.stage4.2', + + # neck + 'model.12': 'neck.top_down_layers.0', + 'model.15': 'neck.top_down_layers.1', + 'model.16': 'neck.downsample_layers.0', + 'model.18': 'neck.bottom_up_layers.0', + 'model.19': 'neck.downsample_layers.1', + 'model.21': 'neck.bottom_up_layers.1', + + # Detector + 'model.22': 'bbox_head.head_module', +} + + +def convert(src, dst): + """Convert keys in pretrained YOLOv8 models to mmyolo style.""" + convert_dict = convert_dict_s + + try: + yolov8_model = torch.load(src)['model'] + blobs = yolov8_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the ultralytics repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.' + 'Also need to install hydra-core>=1.2.0 and thop>=0.1.1') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + num, module = key.split('.')[1:3] + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + + if '.m.' in new_key: + new_key = new_key.replace('.m.', '.blocks.') + new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto.cv', + 'bbox_head.head_module.proto_preds.conv') + elif 'bbox_head.head_module.proto' in new_key: + new_key = new_key.replace('bbox_head.head_module.proto', + 'bbox_head.head_module.proto_preds') + elif 'bbox_head.head_module.cv4.' 
in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.cv4', + 'bbox_head.head_module.mask_coeff_preds') + new_key = new_key.replace('.2.weight', '.2.conv.weight') + new_key = new_key.replace('.2.bias', '.2.conv.bias') + elif 'bbox_head.head_module' in new_key: + new_key = new_key.replace('.cv2', '.reg_preds') + new_key = new_key.replace('.cv3', '.cls_preds') + elif 'backbone.stage4.2' in new_key: + new_key = new_key.replace('.cv', '.conv') + else: + new_key = new_key.replace('.cv1', '.main_conv') + new_key = new_key.replace('.cv2', '.final_conv') + + if 'bbox_head.head_module.dfl.conv.weight' == new_key: + print('Drop "bbox_head.head_module.dfl.conv.weight", ' + 'because it is useless') + continue + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the ultralytics repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov8s.pt', help='src YOLOv8 model path') + parser.add_argument('--dst', default='mmyolov8s.pth', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..5fcc7356780444db59517c931ce1a3557ec8340a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +from collections import OrderedDict + +import torch + +neck_dict = { + 'backbone.lateral_conv0': 'neck.reduce_layers.2', + 'backbone.C3_p4.conv': 'neck.top_down_layers.0.0.cv', + 'backbone.C3_p4.m.0.': 'neck.top_down_layers.0.0.m.0.', + 'backbone.reduce_conv1': 'neck.top_down_layers.0.1', + 'backbone.C3_p3.conv': 'neck.top_down_layers.1.cv', + 'backbone.C3_p3.m.0.': 'neck.top_down_layers.1.m.0.', + 'backbone.bu_conv2': 'neck.downsample_layers.0', + 'backbone.C3_n3.conv': 'neck.bottom_up_layers.0.cv', + 'backbone.C3_n3.m.0.': 'neck.bottom_up_layers.0.m.0.', + 'backbone.bu_conv1': 'neck.downsample_layers.1', + 'backbone.C3_n4.conv': 'neck.bottom_up_layers.1.cv', + 'backbone.C3_n4.m.0.': 'neck.bottom_up_layers.1.m.0.', +} + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key[9:] + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_backbone(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('backbone.dark', 'stage') + num = int(new_key[14]) - 1 + new_key = new_key[:14] + str(num) + new_key[15:] + if '.m.' in model_key: + new_key = new_key.replace('.m.', '.blocks.') + elif not new_key[16] == '0' and 'stage4.1' not in new_key: + new_key = new_key.replace('conv1', 'main_conv') + new_key = new_key.replace('conv2', 'short_conv') + new_key = new_key.replace('conv3', 'final_conv') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_neck(model_key, model_weight, state_dict, converted_names): + for old, new in neck_dict.items(): + if old in model_key: + new_key = model_key.replace(old, new) + if '.m.' 
in model_key: + new_key = new_key.replace('.m.', '.blocks.') + elif '.C' in model_key: + new_key = new_key.replace('cv1', 'main_conv') + new_key = new_key.replace('cv2', 'short_conv') + new_key = new_key.replace('cv3', 'final_conv') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + if 'stem' in model_key: + new_key = model_key.replace('head.stem', 'neck.out_layer') + elif 'cls_convs' in model_key: + new_key = model_key.replace( + 'head.cls_convs', 'bbox_head.head_module.multi_level_cls_convs') + elif 'reg_convs' in model_key: + new_key = model_key.replace( + 'head.reg_convs', 'bbox_head.head_module.multi_level_reg_convs') + elif 'preds' in model_key: + new_key = model_key.replace('head.', + 'bbox_head.head_module.multi_level_conv_') + new_key = new_key.replace('_preds', '') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert(src, dst): + """Convert keys in detectron pretrained YOLOX models to mmyolo style.""" + blobs = torch.load(src)['model'] + state_dict = OrderedDict() + converted_names = set() + + for key, weight in blobs.items(): + if 'backbone.stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'backbone.backbone' in key: + convert_backbone(key, weight, state_dict, converted_names) + elif 'backbone.neck' not in key and 'head' not in key: + convert_neck(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolox_s.pth', help='src yolox model path') + parser.add_argument('--dst', default='mmyoloxs.pt', 
help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/slurm_test.sh b/models/YOLO-World/third_party/mmyolo/tools/slurm_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..6dd67e57442b741fc30f26102eb5afe16139edb1 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/slurm_test.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +CHECKPOINT=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +PY_ARGS=${@:5} +SRUN_ARGS=${SRUN_ARGS:-""} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} diff --git a/models/YOLO-World/third_party/mmyolo/tools/slurm_train.sh b/models/YOLO-World/third_party/mmyolo/tools/slurm_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3feb3d9c7a6c33d82739cdf5ee10365673aaded --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/slurm_train.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +WORK_DIR=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} +PY_ARGS=${@:5} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} diff --git a/models/YOLO-World/third_party/mmyolo/tools/test.py 
b/models/YOLO-World/third_party/mmyolo/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ac8bde429c946ec18c7f29ea8d7cbad102e262 --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/test.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp + +from mmdet.engine.hooks.utils import trigger_visualization_hook +from mmdet.utils import setup_cache_size_limit_of_dynamo +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.evaluator import DumpResults +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +# TODO: support fuse_conv_bn +def parse_args(): + parser = argparse.ArgumentParser( + description='MMYOLO test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--out', + type=str, + help='output result file (must be a .pkl file) in pickle format') + parser.add_argument( + '--json-prefix', + type=str, + help='the prefix of the output json file without perform evaluation, ' + 'which is useful when you want to format the result to a specific ' + 'format and submit it to the test server') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. 
' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + + # Reduce the number of repeated compilations and improve + # training speed. 
+ setup_cache_size_limit_of_dynamo() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if args.show or args.show_dir: + cfg = trigger_visualization_hook(cfg, args) + + if args.deploy: + cfg.custom_hooks.append(dict(type='SwitchToDeployHook')) + + # add `format_only` and `outfile_prefix` into cfg + if args.json_prefix is not None: + cfg_json = { + 'test_evaluator.format_only': True, + 'test_evaluator.outfile_prefix': args.json_prefix + } + cfg.merge_from_dict(cfg_json) + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + if args.tta: + assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + + cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) + test_data_cfg = cfg.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. 
+ if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = cfg.tta_pipeline + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # add `DumpResults` dummy metric + if args.out is not None: + assert args.out.endswith(('.pkl', '.pickle')), \ + 'The dump file must be a pkl file.' + runner.test_evaluator.metrics.append( + DumpResults(out_file_path=args.out)) + + # start testing + runner.test() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/third_party/mmyolo/tools/train.py b/models/YOLO-World/third_party/mmyolo/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..61f94980d2236295c4ca317520842a53b1813f0a --- /dev/null +++ b/models/YOLO-World/third_party/mmyolo/tools/train.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import logging +import os +import os.path as osp + +from mmdet.utils import setup_cache_size_limit_of_dynamo +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # Reduce the number of repeated compilations and improve + # training speed. 
+ setup_cache_size_limit_of_dynamo() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # resume is determined in this priority: resume from > auto_resume + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume is not None: + cfg.resume = True + cfg.load_from = args.resume + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/tools/dist_test.sh b/models/YOLO-World/tools/dist_test.sh new file mode 100644 index 
0000000000000000000000000000000000000000..dea131b43ea8f1222661d20603d40c18ea7f28a1 --- /dev/null +++ b/models/YOLO-World/tools/dist_test.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/test.py \ + $CONFIG \ + $CHECKPOINT \ + --launcher pytorch \ + ${@:4} diff --git a/models/YOLO-World/tools/dist_train.sh b/models/YOLO-World/tools/dist_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea56f698d1f00b992eec8d481c75b273d202acf5 --- /dev/null +++ b/models/YOLO-World/tools/dist_train.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${MASTER_PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/train.py \ + $CONFIG \ + --launcher pytorch ${@:3} diff --git a/models/YOLO-World/tools/generate_image_prompts.py b/models/YOLO-World/tools/generate_image_prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..ba0d0236c282a3df3b244874899a0370a124fa29 --- /dev/null +++ b/models/YOLO-World/tools/generate_image_prompts.py @@ -0,0 +1,59 @@ +import os +import tqdm +import argparse +import os.path as osp +import numpy as np +from PIL import Image +from transformers import (AutoTokenizer, AutoProcessor, + CLIPVisionModelWithProjection, + CLIPTextModelWithProjection) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + '--model', + 
type=str, + default='../pretrained_models/open-ai-clip-vit-base-patch32') + parser.add_argument('--image-dir', type=str, default='data/samples.txt') + parser.add_argument('--out-dir', type=str, default='') + parser.add_argument('--out-file', type=str) + + args = parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model) + vision_model = CLIPVisionModelWithProjection.from_pretrained(args.model) + text_model = CLIPTextModelWithProjection.from_pretrained(args.model) + processor = AutoProcessor.from_pretrained(args.model) + + # padding prompts + device = 'cuda:0' + text_model.to(device) + texts = tokenizer(text=[' '], return_tensors='pt', padding=True) + texts = texts.to(device) + text_outputs = text_model(**texts) + txt_feats = text_outputs.text_embeds + txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) + txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1]).cpu().data.numpy() + + images = os.listdir(args.image_dir) + category_embeds = [] + + def _forward_vision_model(image_name): + image_path = osp.join(args.image_dir, image_name) + # category = image_name.split('-')[1] + image = Image.open(image_path).convert("RGB") + inputs = processor(images=image, return_tensors="pt", padding=True) + image_outputs = vision_model(**inputs) + img_feats = image_outputs.image_embeds + # img_feats + img_feats = img_feats / img_feats.norm(p=2, dim=-1, keepdim=True) + img_feats = img_feats.reshape( + -1, img_feats.shape[-1])[0].cpu().data.numpy() + category_embeds.append(img_feats) + + for image_ in tqdm.tqdm(images): + _forward_vision_model(image_) + category_embeds.append(txt_feats) + category_embeds = np.stack(category_embeds) + np.save(osp.join(args.out_dir, args.out_file), category_embeds) diff --git a/models/YOLO-World/tools/generate_text_prompts.py b/models/YOLO-World/tools/generate_text_prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc00aad87a310da0d8b5774b1af8df2176f2015 --- /dev/null +++ 
if __name__ == "__main__":
    # Local import: this script's module-level imports do not include torch,
    # which is needed here for device selection and ``no_grad``.
    import torch

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='./pretrained_models/clip-vit-base-patch32-projection')
    parser.add_argument('--text',
                        type=str,
                        default='data/captions/coco_class_captions.json')
    parser.add_argument('--out', type=str, default='output.npy')

    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = CLIPTextModelWithProjection.from_pretrained(args.model)

    # The JSON file is read as a list of entries; only the first element of
    # each entry is used as that entry's prompt.
    with open(args.text) as f:
        data = json.load(f)
    texts = [x[0] for x in data]

    # Fall back to CPU so the script also runs on machines without CUDA.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    texts = tokenizer(text=texts, return_tensors='pt', padding=True)
    texts = texts.to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        txt_feats = model(**texts).text_embeds
        # L2-normalize every embedding row.
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        txt_feats = txt_feats.reshape(-1, txt_feats.shape[-1])

    np.save(args.out, txt_feats.cpu().numpy())
def convert_head(scale, bias, text_embed):
    """Fold text embeddings into a 1x1 convolution weight and bias.

    Args:
        scale (Tensor): log-scale; ``exp(scale)`` multiplies the embeddings.
        bias (Tensor): value broadcast to one bias entry per embedding row.
        text_embed (Tensor): 2-D tensor of shape ``(N, D)``.

    Returns:
        tuple[Tensor, Tensor]: weight of shape ``(N, D, 1, 1)`` and bias of
        shape ``(N,)``.
    """
    num_rows, embed_dim = text_embed.shape
    weight = (text_embed * scale.exp()).view(num_rows, embed_dim, 1, 1)
    # Build the ones on the dtype/device of ``bias`` so that checkpoints not
    # stored as float32-on-CPU do not hit a dtype or device mismatch.
    bias = torch.ones(num_rows, dtype=bias.dtype, device=bias.device) * bias
    return weight, bias


def reparameterize_head(state_dict, embeds):
    """Replace the contrastive head parameters with conv weight/bias.

    For each of the three ``cls_contrasts`` layers the ``logit_scale`` and
    ``bias`` entries are removed and ``conv.weight`` / ``conv.bias`` are
    written instead. ``state_dict`` is modified in place and returned.

    Raises:
        KeyError: if a ``logit_scale`` or ``bias`` entry is missing.
    """
    cls_layers = [
        'bbox_head.head_module.cls_contrasts.0',
        'bbox_head.head_module.cls_contrasts.1',
        'bbox_head.head_module.cls_contrasts.2'
    ]

    for layer in cls_layers:
        scale = state_dict.pop(layer + '.logit_scale')
        bias = state_dict.pop(layer + '.bias')
        weight, bias = convert_head(scale, bias, embeds)
        state_dict[layer + '.conv.weight'] = weight
        state_dict[layer + '.conv.bias'] = bias
    return state_dict


def convert_neck_split_conv(input_state_dict, block_name, text_embeds,
                            num_heads):
    """Turn a block's ``guide_fc`` into per-head ``guide_convs`` weights.

    The text embeddings are projected with ``guide_fc`` and the result is
    split along the channel axis into chunks of ``D // num_heads`` channels;
    chunk ``i`` is stored as ``guide_convs.{i}.weight`` with two trailing
    singleton dimensions. Blocks without ``guide_fc.weight`` are untouched.
    ``input_state_dict`` is modified in place and returned.
    """
    if block_name + '.guide_fc.weight' not in input_state_dict:
        return input_state_dict
    guide_fc_weight = input_state_dict.pop(block_name + '.guide_fc.weight')
    guide_fc_bias = input_state_dict.pop(block_name + '.guide_fc.bias')
    guide = (text_embeds @ guide_fc_weight.transpose(0, 1) +
             guide_fc_bias[None, :])
    head_dim = guide.shape[1] // num_heads
    # zip() with range() keeps only the first ``num_heads`` chunks, as the
    # original index loop did.
    for i, chunk in zip(range(num_heads), guide.split(head_dim, dim=1)):
        input_state_dict[block_name +
                         f'.guide_convs.{i}.weight'] = chunk[:, :, None, None]
    return input_state_dict


def convert_neck_weight(input_state_dict, block_name, embeds, num_heads):
    """Turn a block's ``guide_fc`` into a single ``guide_weight`` tensor.

    The projected embeddings of shape ``(N, D)`` are viewed as
    ``(N, D // num_heads, num_heads)``. Blocks without ``guide_fc.weight``
    are left untouched, matching ``convert_neck_split_conv``.
    ``input_state_dict`` is modified in place and returned.
    """
    if block_name + '.guide_fc.weight' not in input_state_dict:
        return input_state_dict
    guide_fc_weight = input_state_dict.pop(block_name + '.guide_fc.weight')
    guide_fc_bias = input_state_dict.pop(block_name + '.guide_fc.bias')
    guide = embeds @ guide_fc_weight.transpose(0, 1) + guide_fc_bias[None, :]
    num_rows, channels = guide.shape
    input_state_dict[block_name + '.guide_weight'] = guide.view(
        num_rows, channels // num_heads, num_heads)
    return input_state_dict


def reparameterize_neck(state_dict, embeds, type='conv'):
    """Re-parameterize the attention blocks of the neck.

    Args:
        state_dict (dict): model weights, modified in place.
        embeds (Tensor): text embeddings of shape ``(N, D)``.
        type (str): ``'conv'`` writes per-head ``guide_convs`` weights; any
            other value writes a single ``guide_weight`` per block.

    Returns:
        dict: the same ``state_dict`` object.
    """
    neck_blocks = [
        'neck.top_down_layers.0.attn_block',
        'neck.top_down_layers.1.attn_block',
        'neck.bottom_up_layers.0.attn_block',
        'neck.bottom_up_layers.1.attn_block'
    ]
    convert = convert_neck_split_conv if type == 'conv' \
        else convert_neck_weight
    for block in neck_blocks:
        # Checked per block (not only for the first one) so that a neck
        # with fewer attention blocks does not raise KeyError.
        if block + '.bias' not in state_dict:
            continue
        num_heads = state_dict[block + '.bias'].shape[0]
        convert(state_dict, block, embeds, num_heads)
    return state_dict


def main():
    """Load a checkpoint, drop the text encoder, re-parameterize and save.

    The output file is the input base name with ``_rep_<neck type>``
    inserted before the extension, written to ``--out-dir`` (or next to the
    input checkpoint when ``--out-dir`` is not given).
    """
    args = parse_args()

    # load checkpoint
    model = torch.load(args.model, map_location='cpu')
    state_dict = model['state_dict']

    # load embeddings
    embeddings = torch.from_numpy(np.load(args.text_embed))

    # remove text encoder
    state_dict_wo_text = {
        key: value
        for key, value in state_dict.items() if "text_model" not in key
    }
    print("removing text encoder")

    state_dict_wo_text = reparameterize_head(state_dict_wo_text, embeddings)
    print("reparameterizing head")

    neck_type = "conv" if args.conv_neck else "linear"
    state_dict_wo_text = reparameterize_neck(state_dict_wo_text, embeddings,
                                             neck_type)
    print("reparameterizing neck")

    model['state_dict'] = state_dict_wo_text

    # splitext always changes the name; ``replace('.pth', ...)`` left names
    # without '.pth' unchanged, which could overwrite the input checkpoint.
    stem, ext = os.path.splitext(os.path.basename(args.model))
    model_name = f'{stem}_rep_{neck_type}{ext}'
    # ``--out-dir`` has no default; joining with None would raise TypeError.
    out_dir = args.out_dir or os.path.dirname(args.model)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    torch.save(model, os.path.join(out_dir, model_name))


if __name__ == "__main__":
    main()
+import argparse +import os +import os.path as osp + +from mmdet.engine.hooks.utils import trigger_visualization_hook +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.evaluator import DumpResults +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +# TODO: support fuse_conv_bn +def parse_args(): + parser = argparse.ArgumentParser( + description='MMYOLO test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--out', + type=str, + help='output result file (must be a .pkl file) in pickle format') + parser.add_argument( + '--json-prefix', + type=str, + help='the prefix of the output json file without perform evaluation, ' + 'which is useful when you want to format the result to a specific ' + 'format and submit it to the test server') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. ' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
def main():
    """Test (and evaluate) a model described by a config and a checkpoint.

    Reads the CLI arguments, applies them on top of the loaded config,
    builds the runner (default ``Runner`` or the one named by
    ``runner_type``) and calls ``runner.test()``.
    """
    args = parse_args()

    # Validate the dump path first so that a wrong extension is reported
    # before the runner (and the model) is built.
    if args.out is not None:
        assert args.out.endswith(('.pkl', '.pickle')), \
            'The dump file must be a pkl file.'

    # load config
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])

    cfg.load_from = args.checkpoint

    if args.show or args.show_dir:
        cfg = trigger_visualization_hook(cfg, args)

    if args.deploy:
        deploy_hook = dict(type='SwitchToDeployHook')
        # A config is not required to define ``custom_hooks``; create the
        # list instead of failing on the attribute lookup.
        if cfg.get('custom_hooks', None) is None:
            cfg.custom_hooks = [deploy_hook]
        else:
            cfg.custom_hooks.append(deploy_hook)

    # add `format_only` and `outfile_prefix` into cfg
    if args.json_prefix is not None:
        cfg_json = {
            'test_evaluator.format_only': True,
            'test_evaluator.outfile_prefix': args.json_prefix
        }
        cfg.merge_from_dict(cfg_json)

    # Determine whether the custom metainfo fields are all lowercase
    is_metainfo_lower(cfg)

    if args.tta:
        assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' \
                                   " Can't use tta !"
        assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` ' \
                                      "in config. Can't use tta !"

        cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model)
        # Unwrap nested dataset wrappers down to the innermost dataset cfg.
        test_data_cfg = cfg.test_dataloader.dataset
        while 'dataset' in test_data_cfg:
            test_data_cfg = test_data_cfg['dataset']

        # batch_shapes_cfg will force control the size of the output image,
        # it is not compatible with tta.
        if 'batch_shapes_cfg' in test_data_cfg:
            test_data_cfg.batch_shapes_cfg = None
        test_data_cfg.pipeline = cfg.tta_pipeline

    # build the runner from config
    if 'runner_type' not in cfg:
        # build the default runner
        runner = Runner.from_cfg(cfg)
    else:
        # build customized runner from the registry
        # if 'runner_type' is set in the cfg
        runner = RUNNERS.build(cfg)

    # add `DumpResults` dummy metric
    if args.out is not None:
        runner.test_evaluator.metrics.append(
            DumpResults(out_file_path=args.out))

    # start testing
    runner.test()


if __name__ == '__main__':
    main()
+import argparse +import logging +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + if args.config.startswith('projects/'): + config = args.config[len('projects/'):] + config = config.replace('/configs/', '/') + cfg.work_dir = osp.join('./work_dirs', osp.splitext(config)[0]) + else: + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # resume is determined in this priority: resume from > auto_resume + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume 
is not None: + cfg.resume = True + cfg.load_from = args.resume + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/models/YOLO-World/yolo_world/__init__.py b/models/YOLO-World/yolo_world/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce4f9f061c4ac63b9020cb5ddaaf8e2c8930315 --- /dev/null +++ b/models/YOLO-World/yolo_world/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import importlib.metadata as importlib_metadata + +try: + __version__ = importlib_metadata.version(__package__ or __name__) +except importlib_metadata.PackageNotFoundError: + __version__ = '0.0.0' + + +from .models import * # noqa +from .datasets import * # noqa +from .engine import * # noqa diff --git a/models/YOLO-World/yolo_world/datasets/__init__.py b/models/YOLO-World/yolo_world/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fbdad0ca10bca182c7323295d898afc03bd3913 --- /dev/null +++ b/models/YOLO-World/yolo_world/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+from .mm_dataset import ( + MultiModalDataset, MultiModalMixedDataset) +from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset +from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset +from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset +from .utils import yolow_collate +from .transformers import * # NOQA +from .yolov5_v3det import YOLOv5V3DetDataset +from .yolov5_lvis import YOLOv5LVISV1Dataset + +__all__ = [ + 'MultiModalDataset', 'YOLOv5Objects365V1Dataset', + 'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset', + 'YOLOv5V3DetDataset', 'yolow_collate', + 'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset', +] diff --git a/models/YOLO-World/yolo_world/datasets/mm_dataset.py b/models/YOLO-World/yolo_world/datasets/mm_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..510e8b935fc85a570247b92b2459eaf160632199 --- /dev/null +++ b/models/YOLO-World/yolo_world/datasets/mm_dataset.py @@ -0,0 +1,122 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
@DATASETS.register_module()
class MultiModalDataset:
    """Wrap a dataset and attach class texts to every sample.

    Args:
        dataset (BaseDataset or dict): the wrapped dataset, or a config dict
            that is built through the ``DATASETS`` registry.
        class_text_path (str, optional): path of a JSON file whose content is
            attached to each sample under the ``'texts'`` key. When ``None``
            no texts are attached.
        test_mode (bool): used in ``__getitem__`` to decide whether the
            dataset object is passed to the pipeline. Defaults to True.
        pipeline (list, optional): transforms composed into the processing
            pipeline. ``None`` (the default) means an empty pipeline.
        lazy_init (bool): when False, ``full_init`` runs in the constructor.

    Raises:
        TypeError: if ``dataset`` is neither a dict nor a ``BaseDataset``.
    """

    def __init__(self,
                 dataset: Union[BaseDataset, dict],
                 class_text_path: str = None,
                 test_mode: bool = True,
                 pipeline: Union[List[Union[dict, Callable]], None] = None,
                 lazy_init: bool = False) -> None:
        self.dataset: BaseDataset
        if isinstance(dataset, dict):
            self.dataset = DATASETS.build(dataset)
        elif isinstance(dataset, BaseDataset):
            self.dataset = dataset
        else:
            raise TypeError(
                'dataset must be a dict or a BaseDataset, '
                f'but got {dataset}')

        if class_text_path is not None:
            # Context manager so the file handle is closed deterministically.
            with open(class_text_path, 'r') as f:
                self.class_texts = json.load(f)
        else:
            self.class_texts = None

        self.test_mode = test_mode
        self._metainfo = self.dataset.metainfo
        # ``None`` sentinel instead of a shared mutable default list.
        self.pipeline = Compose([] if pipeline is None else pipeline)

        self._fully_initialized = False
        if not lazy_init:
            self.full_init()

    @property
    def metainfo(self) -> dict:
        """Return a deep copy of the wrapped dataset's meta information."""
        return copy.deepcopy(self._metainfo)

    def full_init(self) -> None:
        """Fully initialize the wrapped dataset once and cache its length."""
        if self._fully_initialized:
            return

        self.dataset.full_init()
        self._ori_len = len(self.dataset)
        self._fully_initialized = True

    @force_full_init
    def get_data_info(self, idx: int) -> dict:
        """Return the sample info at ``idx``, plus ``'texts'`` if loaded."""
        data_info = self.dataset.get_data_info(idx)
        if self.class_texts is not None:
            data_info.update({'texts': self.class_texts})
        return data_info

    def __getitem__(self, idx):
        if not self._fully_initialized:
            print_log(
                'Please call `full_init` method manually to '
                'accelerate the speed.',
                logger='current',
                level=logging.WARNING)
            self.full_init()

        data_info = self.get_data_info(idx)

        # The dataset object is handed to the pipeline when either the
        # wrapped dataset or this wrapper is not in test mode.
        inner_training = (hasattr(self.dataset, 'test_mode')
                          and not self.dataset.test_mode)
        if inner_training or not self.test_mode:
            data_info['dataset'] = self
        return self.pipeline(data_info)

    @force_full_init
    def __len__(self) -> int:
        return self._ori_len


@DATASETS.register_module()
class MultiModalMixedDataset(MultiModalDataset):
    """Multi-modal dataset mixing "detection" and "caption" data.

    Args:
        dataset_type (str): dataset type, 'detection' or 'caption'. Samples
            get ``is_detection = 1`` for 'detection' and ``0`` otherwise.

    The remaining arguments are forwarded to ``MultiModalDataset``.
    """

    def __init__(self,
                 dataset: Union[BaseDataset, dict],
                 class_text_path: str = None,
                 dataset_type: str = 'detection',
                 test_mode: bool = True,
                 pipeline: Union[List[Union[dict, Callable]], None] = None,
                 lazy_init: bool = False) -> None:
        # Must be set before super().__init__, which may run full_init().
        self.dataset_type = dataset_type
        super().__init__(dataset,
                         class_text_path,
                         test_mode,
                         pipeline,
                         lazy_init)

    @force_full_init
    def get_data_info(self, idx: int) -> dict:
        """Return the parent's sample info plus the ``is_detection`` flag."""
        data_info = super().get_data_info(idx)
        data_info['is_detection'] = 1 \
            if self.dataset_type == 'detection' else 0
        return data_info
class BaseMultiModalMixImageTransform(BaseTransform, metaclass=ABCMeta):
    """A Base Transform of Multimodal multiple images mixed.

    Suitable for training on multiple images mixed data augmentation like
    mosaic and mixup.

    Cached mosaic transform will random select images from the cache
    and combine them into one output image if use_cached is True.

    Args:
        pre_transform(Sequence[str]): Sequence of transform object or
            config dict to be composed. Defaults to None.
        prob(float): The transformation probability. Defaults to 1.0.
        use_cached (bool): Whether to use cache. Defaults to False.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 40.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        max_refetch (int): The maximum number of retry iterations for getting
            valid results from the pipeline. If the number of iterations is
            greater than `max_refetch`, but results is still None, then the
            iteration is terminated and raise the error. Defaults to 15.
    """

    def __init__(self,
                 pre_transform: Optional[Sequence[str]] = None,
                 prob: float = 1.0,
                 use_cached: bool = False,
                 max_cached_images: int = 40,
                 random_pop: bool = True,
                 max_refetch: int = 15):

        self.max_refetch = max_refetch
        self.prob = prob

        self.use_cached = use_cached
        self.max_cached_images = max_cached_images
        self.random_pop = random_pop
        self.results_cache = []

        if pre_transform is None:
            self.pre_transform = None
        else:
            self.pre_transform = Compose(pre_transform)

    @abstractmethod
    def get_indexes(self, dataset: Union[BaseDataset,
                                         list]) -> Union[list, int]:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`Dataset` or list): The dataset or cached list.

        Returns:
            list or int: indexes.
        """
        pass

    @abstractmethod
    def mix_img_transform(self, results: dict) -> dict:
        """Mixed image data transformation.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.
        """
        pass

    def _update_label_text(self, results: dict) -> dict:
        """Merge the texts of all mixed samples and remap their labels.

        The texts of ``results`` and of every entry in
        ``results['mix_results']`` are de-duplicated into one shared list
        (each text converted to a tuple). Every ``gt_bboxes_labels`` entry
        is rewritten in place to index into that list, and each sample's
        ``'texts'`` is replaced by it. Samples without ``'texts'`` in
        ``results`` are returned unchanged.
        """
        if 'texts' not in results:
            return results

        samples = [results] + results['mix_results']
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label mapping does not depend on string hash order.
        mix_texts = list(
            dict.fromkeys(
                tuple(text) for res in samples for text in res['texts']))
        text2id = {text: i for i, text in enumerate(mix_texts)}

        for res in samples:
            for i, label in enumerate(res['gt_bboxes_labels']):
                text = res['texts'][label]
                res['gt_bboxes_labels'][i] = text2id[tuple(text)]
            res['texts'] = mix_texts
        return results

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Data augmentation function.

        The transform steps are as follows:
        1. Randomly generate index list of other images.
        2. Before Mosaic or MixUp need to go through the necessary
           pre_transform, such as MixUp' pre_transform pipeline
           include: 'LoadImageFromFile','LoadAnnotations',
           'Mosaic' and 'RandomAffine'.
        3. Use mix_img_transform function to implement specific
           mix operations.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.

        Raises:
            RuntimeError: if no valid set of mix results was obtained
                within ``max_refetch`` attempts.
        """

        if random.uniform(0, 1) > self.prob:
            return results

        if self.use_cached:
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)
            self.results_cache.append(copy.deepcopy(results))
            if len(self.results_cache) > self.max_cached_images:
                if self.random_pop:
                    # ``random`` is numpy.random here: the upper bound is
                    # exclusive, so the entry just appended is never popped.
                    index = random.randint(0, len(self.results_cache) - 1)
                else:
                    index = 0
                self.results_cache.pop(index)

            if len(self.results_cache) <= 4:
                return results
        else:
            assert 'dataset' in results
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)

        for _ in range(self.max_refetch):
            # get index of one or three other images
            if self.use_cached:
                indexes = self.get_indexes(self.results_cache)
            else:
                indexes = self.get_indexes(dataset)

            if not isinstance(indexes, collections.abc.Sequence):
                indexes = [indexes]

            if self.use_cached:
                mix_results = [
                    copy.deepcopy(self.results_cache[i]) for i in indexes
                ]
            else:
                # get images information will be used for Mosaic or MixUp
                mix_results = [
                    copy.deepcopy(dataset.get_data_info(index))
                    for index in indexes
                ]

            if self.pre_transform is not None:
                for i, data in enumerate(mix_results):
                    # pre_transform may also require dataset
                    data.update({'dataset': dataset})
                    # before Mosaic or MixUp need to go through
                    # the necessary pre_transform
                    _results = self.pre_transform(data)
                    # A None result must reach the refetch check below
                    # instead of failing on ``None.pop``.
                    if _results is not None:
                        _results.pop('dataset', None)
                    mix_results[i] = _results

            if None not in mix_results:
                results['mix_results'] = mix_results
                break
            print('Repeated calculation')
        else:
            raise RuntimeError(
                'The loading pipeline of the original dataset'
                ' always return None. Please check the correctness '
                'of the dataset and its pipeline.')

        # update labels and texts
        results = self._update_label_text(results)

        # Mosaic or MixUp
        results = self.mix_img_transform(results)

        if 'mix_results' in results:
            results.pop('mix_results')
        results['dataset'] = dataset

        return results
code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. 
Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + # print("use mosaic") + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + # print("with_mask: ", with_mask) + # self.img_scale is wh format + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w) + center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = results + else: + results_patch = results['mix_results'][i - 1] + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + 
def _mosaic_combine(
        self, loc: str, center_position_xy: Sequence[float],
        img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
    """Compute where a sub-image is pasted and which part of it is kept.

    The mosaic canvas is twice ``self.img_scale`` (width, height). Each
    sub-image touches the mixing center with one of its corners and is
    cut off wherever it would leave the canvas.

    Args:
        loc (str): Position of the sub-image, one of 'top_left',
            'top_right', 'bottom_left', 'bottom_right'.
        center_position_xy (Sequence[float]): Mixing center of the four
            images, (x, y).
        img_shape_wh (Sequence[int]): Width and height of the sub-image.

    Returns:
        tuple[tuple[float]]: ``(paste_coord, crop_coord)``.

        - paste_coord (tuple): (x1, y1, x2, y2) of the target area in the
          mosaic image.
        - crop_coord (tuple): (x1, y1, x2, y2) of the area taken from the
          sub-image.
    """
    assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
    center_x, center_y = center_position_xy[0], center_position_xy[1]
    patch_w, patch_h = img_shape_wh[0], img_shape_wh[1]

    # Horizontal axis: a left patch ends at the center and is clipped at
    # x = 0, so its right-most columns are kept. A right patch starts at
    # the center and is clipped at the canvas width, keeping the left-most
    # columns.
    if loc in ('top_left', 'bottom_left'):
        x1 = max(center_x - patch_w, 0)
        x2 = center_x
        crop_x1, crop_x2 = patch_w - (x2 - x1), patch_w
    else:
        x1 = center_x
        x2 = min(center_x + patch_w, self.img_scale[0] * 2)
        crop_x1, crop_x2 = 0, min(patch_w, x2 - x1)

    # Vertical axis: same reasoning with top/bottom instead of left/right.
    if loc in ('top_left', 'top_right'):
        y1 = max(center_y - patch_h, 0)
        y2 = center_y
        crop_y1, crop_y2 = patch_h - (y2 - y1), patch_h
    else:
        y1 = center_y
        y2 = min(self.img_scale[1] * 2, center_y + patch_h)
        crop_y1, crop_y2 = 0, min(y2 - y1, patch_h)

    paste_coord = x1, y1, x2, y2
    crop_coord = crop_x1, crop_y1, crop_x2, crop_y2
    return paste_coord, crop_coord
f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class MultiModalMosaic9(BaseMultiModalMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. 
+ pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + bbox_clip_border: bool = True, + pad_val: Union[float, int] = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 50, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 9, 'The length of cache must >= 9, ' \ + f'but got {max_cached_images}.' 
def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
    """Sample the indexes of the eight images placed around the center one.

    Args:
        dataset (:obj:`Dataset` or list): The dataset or cached list. Only
            its length is used here.

    Returns:
        list: Eight indexes, each drawn independently and uniformly from
        ``[0, len(dataset))``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    # ``random.randint`` includes its upper bound, so ``randint(0, len)``
    # could return ``len(dataset)``, one past the last valid index.
    # ``randrange`` excludes the bound and keeps every index usable.
    num_candidates = len(dataset)
    return [random.randrange(num_candidates) for _ in range(8)]
def _mosaic_combine(self, loc: str,
                    img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]:
    """Compute where a sub-image is pasted in the 3x3 mosaic canvas.

    The method is stateful: the shape of the sub-image handled by the
    previous call and the shape of the center image are remembered on the
    instance and used to place the current one. The returned box always
    has the size of the current sub-image and may extend outside the
    canvas.

    Args:
        loc (str): Position of the sub-image, one of 'center', 'top',
            'top_right', 'right', 'bottom_right', 'bottom', 'bottom_left',
            'left', 'top_left'.
        img_shape_hw (Sequence[int]): Height and width of the sub-image.

    Returns:
        paste_coord (tuple): (xmin, ymin, xmax, ymax) of the sub-image in
        the mosaic image.
    """
    assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right',
                   'bottom', 'bottom_left', 'left', 'top_left')

    # Top-left corner of the center cell.
    origin_x, origin_y = self.img_scale

    self._current_img_shape = img_shape_hw
    cur_h, cur_w = self._current_img_shape
    prev_h, prev_w = self._previous_img_shape
    cen_h, cen_w = self._center_img_shape
    if loc == 'center':
        # Later calls align against the center image.
        self._center_img_shape = self._current_img_shape

    # Right and bottom edges of the center image.
    center_right = origin_x + cen_w
    center_bottom = origin_y + cen_h

    # xmin, ymin, xmax, ymax for every position.
    layouts = {
        'center': (origin_x,
                   origin_y,
                   origin_x + cur_w,
                   origin_y + cur_h),
        'top': (origin_x,
                origin_y - cur_h,
                origin_x + cur_w,
                origin_y),
        'top_right': (origin_x + prev_w,
                      origin_y - cur_h,
                      origin_x + prev_w + cur_w,
                      origin_y),
        'right': (center_right,
                  origin_y,
                  center_right + cur_w,
                  origin_y + cur_h),
        'bottom_right': (center_right,
                         origin_y + prev_h,
                         center_right + cur_w,
                         origin_y + prev_h + cur_h),
        'bottom': (center_right - cur_w,
                   center_bottom,
                   center_right,
                   center_bottom + cur_h),
        'bottom_left': (center_right - prev_w - cur_w,
                        center_bottom,
                        center_right - prev_w,
                        center_bottom + cur_h),
        'left': (origin_x - cur_w,
                 center_bottom - cur_h,
                 origin_x,
                 center_bottom),
        'top_left': (origin_x - cur_w,
                     center_bottom - prev_h - cur_h,
                     origin_x,
                     center_bottom - prev_h),
    }
    paste_coord = layouts[loc]

    self._previous_img_shape = self._current_img_shape
    return paste_coord
def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
    """Sample the index of the image that is mixed into the current one.

    Args:
        dataset (:obj:`Dataset` or list): The dataset or cached list. Only
            its length is used here.

    Returns:
        int: One index drawn uniformly from ``[0, len(dataset))``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    # ``random.randint`` includes its upper bound, so ``randint(0, len)``
    # could return ``len(dataset)``, one past the last valid index.
    # ``randrange`` excludes the bound and keeps the index usable.
    return random.randrange(len(dataset))
+ """ + assert 'mix_results' in results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + ori_img = results['img'] + assert ori_img.shape == retrieve_img.shape + + # Randomly obtain the fusion ratio from the beta distribution, + # which is around 0.5 + ratio = np.random.beta(self.alpha, self.beta) + mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio)) + + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = retrieve_gt_bboxes.cat( + (results['gt_bboxes'], retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if 'gt_masks' in results: + assert 'gt_masks' in retrieve_results + mixup_gt_masks = results['gt_masks'].cat( + [results['gt_masks'], retrieve_results['gt_masks']]) + results['gt_masks'] = mixup_gt_masks + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + +@TRANSFORMS.register_module() +class YOLOXMultiModalMixUp(BaseMultiModalMixImageTransform): + """MixUp data augmentation for YOLOX. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. 
The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. 
def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
    """Sample the index of the image that is mixed into the current one.

    Args:
        dataset (:obj:`Dataset` or list): The dataset or cached list. Only
            its length is used here.

    Returns:
        int: One index drawn uniformly from ``[0, len(dataset))``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    # ``random.randint`` includes its upper bound, so ``randint(0, len)``
    # could return ``len(dataset)``, one past the last valid index.
    # ``randrange`` excludes the bound and keeps the index usable.
    return random.randrange(len(dataset))
keep_ratio resize + scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0], + self.img_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_filp: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. 
def __repr__(self) -> str:
    """Return the class name followed by the transform's settings."""
    settings = (
        f'img_scale={self.img_scale}',
        f'ratio_range={self.ratio_range}',
        f'flip_ratio={self.flip_ratio}',
        f'pad_val={self.pad_val}',
        f'max_refetch={self.max_refetch}',
        f'bbox_clip_border={self.bbox_clip_border}',
    )
    return f"{self.__class__.__name__}({', '.join(settings)})"
@TRANSFORMS.register_module()
class RandomLoadText:
    """Randomly sample positive and negative class prompts for one sample.

    The classes present in the ground-truth labels ("positive" labels) are
    kept, randomly chosen absent classes ("negative" labels) are added, and
    the combined list is shuffled. Ground-truth labels are remapped to
    their position in that shuffled list, and one randomly picked caption
    per sampled class is stored in ``results['texts']``.

    Args:
        text_path (str, optional): Path of a JSON file that is indexable by
            class label and yields, for every class, a non-empty list of
            captions. It is only consulted when ``results`` carries no
            ``'texts'`` entry. Defaults to None.
        prompt_format (str): ``str.format`` template applied to every
            selected caption. Defaults to '{}'.
        num_neg_samples (Tuple[int, int]): Inclusive range from which the
            number of negative classes is drawn. Defaults to (80, 80).
        max_num_samples (int): Upper bound on the number of sampled classes
            (positives plus negatives). Defaults to 80.
        padding_to_max (bool): Whether to pad ``texts`` up to
            ``max_num_samples`` entries. Defaults to False.
        padding_value (str): Text used for padding. Defaults to ''.
    """

    def __init__(self,
                 text_path: str = None,
                 prompt_format: str = '{}',
                 num_neg_samples: Tuple[int, int] = (80, 80),
                 max_num_samples: int = 80,
                 padding_to_max: bool = False,
                 padding_value: str = '') -> None:
        self.prompt_format = prompt_format
        self.num_neg_samples = num_neg_samples
        self.max_num_samples = max_num_samples
        self.padding_to_max = padding_to_max
        self.padding_value = padding_value
        # ``class_texts`` is deliberately left undefined when no file is
        # given: ``__call__`` uses ``hasattr`` to detect that case.
        if text_path is not None:
            # JSON is UTF-8 by specification; do not depend on the
            # platform's default encoding.
            with open(text_path, 'r', encoding='utf-8') as f:
                self.class_texts = json.load(f)

    def __call__(self, results: dict) -> dict:
        """Sample class prompts and remap the labels of ``results``.

        Args:
            results (dict): Result dict. Must contain 'gt_bboxes' and either
                'gt_labels' or 'gt_bboxes_labels'. 'texts' and 'instances'
                are used when present.

        Returns:
            dict: The same dict with filtered 'gt_bboxes', remapped and
            filtered labels (and 'instances'), and the sampled 'texts'.

        Raises:
            ValueError: If no label entry is found in ``results``.
        """
        assert 'texts' in results or hasattr(self, 'class_texts'), (
            'No texts found in results.')
        class_texts = results.get(
            'texts',
            getattr(self, 'class_texts', None))

        num_classes = len(class_texts)
        if 'gt_labels' in results:
            gt_label_tag = 'gt_labels'
        elif 'gt_bboxes_labels' in results:
            gt_label_tag = 'gt_bboxes_labels'
        else:
            raise ValueError('No valid labels found in results.')
        positive_labels = set(results[gt_label_tag])

        # Too many distinct classes in the image: keep a random subset.
        if len(positive_labels) > self.max_num_samples:
            positive_labels = set(random.sample(list(positive_labels),
                                                k=self.max_num_samples))

        # Fill the remaining budget with classes absent from the image.
        num_neg_samples = min(
            min(num_classes, self.max_num_samples) - len(positive_labels),
            random.randint(*self.num_neg_samples))
        candidate_neg_labels = [
            idx for idx in range(num_classes) if idx not in positive_labels
        ]
        negative_labels = random.sample(
            candidate_neg_labels, k=num_neg_samples)

        sampled_labels = list(positive_labels) + list(negative_labels)
        random.shuffle(sampled_labels)

        # Original class id -> position in the shuffled prompt list.
        label2ids = {label: i for i, label in enumerate(sampled_labels)}

        # Boxes whose class was dropped above are removed; the others get
        # their label rewritten in place.
        gt_valid_mask = np.zeros(len(results['gt_bboxes']), dtype=bool)
        for idx, label in enumerate(results[gt_label_tag]):
            if label in label2ids:
                gt_valid_mask[idx] = True
                results[gt_label_tag][idx] = label2ids[label]
        results['gt_bboxes'] = results['gt_bboxes'][gt_valid_mask]
        results[gt_label_tag] = results[gt_label_tag][gt_valid_mask]

        if 'instances' in results:
            retaged_instances = []
            for inst in results['instances']:
                label = inst['bbox_label']
                if label in label2ids:
                    inst['bbox_label'] = label2ids[label]
                    retaged_instances.append(inst)
            results['instances'] = retaged_instances

        # One random caption per sampled class, in shuffled order.
        texts = []
        for label in sampled_labels:
            cls_caps = class_texts[label]
            assert len(cls_caps) > 0
            cap_id = random.randrange(len(cls_caps))
            sel_cls_cap = self.prompt_format.format(cls_caps[cap_id])
            texts.append(sel_cls_cap)

        if self.padding_to_max:
            num_valid_labels = len(positive_labels) + len(negative_labels)
            num_padding = self.max_num_samples - num_valid_labels
            if num_padding > 0:
                texts += [self.padding_value] * num_padding

        results['texts'] = texts

        return results
@COLLATE_FUNCTIONS.register_module()
def yolow_collate(data_batch: Sequence,
                  use_ms_training: bool = False) -> dict:
    """Collate per-sample dicts into one batch dict.

    Ground-truth boxes of all samples are merged into a single tensor whose
    rows are ``(batch_index, label, *bbox)`` so that no per-sample
    containers have to be kept.

    Args:
        data_batch (Sequence): Batch of data. Every item provides
            'inputs' and 'data_samples'.
        use_ms_training (bool): Whether to use multi-scale training. If
            True the images are returned as a list, otherwise they are
            stacked into one tensor.

    Returns:
        dict: ``{'data_samples': {...}, 'inputs': ...}``. 'data_samples'
        always holds 'bboxes_labels' and, when available, 'masks',
        'texts' and 'is_detection'.
    """
    images = []
    box_rows = []
    mask_chunks = []
    for sample_idx, item in enumerate(data_batch):
        sample = item['data_samples']
        images.append(item['inputs'])

        boxes = sample.gt_instances.bboxes.tensor
        labels = sample.gt_instances.labels
        if 'masks' in sample.gt_instances:
            mask_chunks.append(
                sample.gt_instances.masks.to(
                    dtype=torch.bool, device=boxes.device))
        # Column holding the sample's position inside the batch.
        index_col = labels.new_full((len(labels), 1), sample_idx)
        box_rows.append(
            torch.cat((index_col, labels[:, None], boxes), dim=1))

    collated_results = {
        'data_samples': {
            'bboxes_labels': torch.cat(box_rows, 0)
        }
    }
    batch_samples = collated_results['data_samples']
    if mask_chunks:
        batch_samples['masks'] = torch.cat(mask_chunks, 0)

    # Multi-scale training keeps the images separate.
    collated_results['inputs'] = (
        images if use_ms_training else torch.stack(images, 0))

    first_sample = data_batch[0]['data_samples']
    if hasattr(first_sample, 'texts'):
        batch_samples['texts'] = [
            meta['data_samples'].texts for meta in data_batch
        ]

    if hasattr(first_sample, 'is_detection'):
        # detection flag
        batch_samples['is_detection'] = torch.tensor(
            [meta['data_samples'].is_detection for meta in data_batch])

    return collated_results
+ """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + # print(len(data_list)) + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. 
+ """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + img_path = None + img_prefix = self.data_prefix.get('img', None) + if isinstance(img_prefix, str): + img_path = osp.join(img_prefix, img_info['file_name']) + elif isinstance(img_prefix, (list, tuple)): + for prefix in img_prefix: + candidate_img_path = osp.join(prefix, img_info['file_name']) + if osp.exists(candidate_img_path): + img_path = candidate_img_path + break + assert img_path is not None, ( + f'Image path {img_info["file_name"]} not found in' + f'{img_prefix}') + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = float(img_info['height']) + data_info['width'] = float(img_info['width']) + + cat2id = {} + texts = [] + for ann in ann_info: + cat_name = ' '.join([img_info['caption'][t[0]:t[1]] + for t in ann['tokens_positive']]) + if cat_name not in cat2id: + cat2id[cat_name] = len(cat2id) + texts.append([cat_name]) + data_info['texts'] = texts + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, + min(x1 + w, float(img_info['width'])) - max(x1, 0)) + inter_h = max(0, + min(y1 + h, float(img_info['height'])) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + + cat_name = ' '.join([img_info['caption'][t[0]:t[1]] + for t in ann['tokens_positive']]) + instance['bbox_label'] = cat2id[cat_name] + + if ann.get('segmentation', None): + instance['mask'] = 
ann['segmentation'] + + instances.append(instance) + # NOTE: for detection task, we set `is_detection` to 1 + data_info['is_detection'] = 1 + data_info['instances'] = instances + # print(data_info['texts']) + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = int(data_info['width']) + height = int(data_info['height']) + if filter_empty_gt and img_id not in ids_with_ann: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos + + def _join_prefix(self): + """Join ``self.data_root`` with ``self.data_prefix`` and + ``self.ann_file``. + """ + # Automatically join annotation file path with `self.root` if + # `self.ann_file` is not an absolute path. + if self.ann_file and not is_abs(self.ann_file) and self.data_root: + self.ann_file = join_path(self.data_root, self.ann_file) + # Automatically join data directory with `self.root` if path value in + # `self.data_prefix` is not an absolute path. 
+ for data_key, prefix in self.data_prefix.items(): + if isinstance(prefix, (list, tuple)): + abs_prefix = [] + for p in prefix: + if not is_abs(p) and self.data_root: + abs_prefix.append(join_path(self.data_root, p)) + else: + abs_prefix.append(p) + self.data_prefix[data_key] = abs_prefix + elif isinstance(prefix, str): + if not is_abs(prefix) and self.data_root: + self.data_prefix[data_key] = join_path( + self.data_root, prefix) + else: + self.data_prefix[data_key] = prefix + else: + raise TypeError('prefix should be a string, tuple or list,' + f'but got {type(prefix)}') diff --git a/models/YOLO-World/yolo_world/datasets/yolov5_obj365v1.py b/models/YOLO-World/yolo_world/datasets/yolov5_obj365v1.py new file mode 100644 index 0000000000000000000000000000000000000000..593dc86b6606eeec7e3fcc3fa00178fd29aad07e --- /dev/null +++ b/models/YOLO-World/yolo_world/datasets/yolov5_obj365v1.py @@ -0,0 +1,15 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from mmdet.datasets import Objects365V1Dataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from mmyolo.registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5Objects365V1Dataset(BatchShapePolicyDataset, Objects365V1Dataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/yolo_world/datasets/yolov5_obj365v2.py b/models/YOLO-World/yolo_world/datasets/yolov5_obj365v2.py new file mode 100644 index 0000000000000000000000000000000000000000..7008565c6bac3813810c025498def528c9a590d6 --- /dev/null +++ b/models/YOLO-World/yolo_world/datasets/yolov5_obj365v2.py @@ -0,0 +1,15 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+from mmdet.datasets import Objects365V2Dataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from mmyolo.registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5Objects365V2Dataset(BatchShapePolicyDataset, Objects365V2Dataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/yolo_world/datasets/yolov5_v3det.py b/models/YOLO-World/yolo_world/datasets/yolov5_v3det.py new file mode 100644 index 0000000000000000000000000000000000000000..554a0a33e3a206a5c5ec8314f09d038a50c502ad --- /dev/null +++ b/models/YOLO-World/yolo_world/datasets/yolov5_v3det.py @@ -0,0 +1,110 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import copy +import json +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.datasets.api_wrappers import COCO +from mmdet.datasets import CocoDataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from mmyolo.registry import DATASETS + +v3det_ignore_list = [ + 'a00013820/26_275_28143226914_ff3a247c53_c.jpg', + 'n03815615/12_1489_32968099046_be38fa580e_c.jpg', + 'n04550184/19_1480_2504784164_ffa3db8844_c.jpg', + 'a00008703/2_363_3576131784_dfac6fc6ce_c.jpg', + 'n02814533/28_2216_30224383848_a90697f1b3_c.jpg', + 'n12026476/29_186_15091304754_5c219872f7_c.jpg', + 'n01956764/12_2004_50133201066_72e0d9fea5_c.jpg', + 'n03785016/14_2642_518053131_d07abcb5da_c.jpg', + 'a00011156/33_250_4548479728_9ce5246596_c.jpg', + 'a00009461/19_152_2792869324_db95bebc84_c.jpg', +] + +# # ugly code here +# with open(osp.join("data/v3det/cats.json"), 'r') as f: +# _classes = json.load(f)['classes'] + + +@DATASETS.register_module() +class V3DetDataset(CocoDataset): + """Objects365 v1 dataset for detection.""" + + METAINFO = {'classes': 'classes', 'palette': None} + + COCOAPI = COCO + # ann_id is 
unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path(self.ann_file, + backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + + # 'categories' list in objects365_train.json and objects365_val.json + # is inconsistent, need sort list(or dict) before get cat_ids. + cats = self.coco.cats + sorted_cats = {i: cats[i] for i in sorted(cats)} + self.coco.cats = sorted_cats + categories = self.coco.dataset['categories'] + sorted_categories = sorted(categories, key=lambda i: i['id']) + self.coco.dataset['categories'] = sorted_categories + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + file_name = osp.join( + osp.split(osp.split(raw_img_info['file_name'])[0])[-1], + osp.split(raw_img_info['file_name'])[-1]) + + if file_name in v3det_ignore_list: + continue + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" 
+ + del self.coco + + return data_list + + +@DATASETS.register_module() +class YOLOv5V3DetDataset(BatchShapePolicyDataset, V3DetDataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/models/YOLO-World/yolo_world/engine/__init__.py b/models/YOLO-World/yolo_world/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74177cd3c2f867cfa85c41ad6e41a75be478af80 --- /dev/null +++ b/models/YOLO-World/yolo_world/engine/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .optimizers import * # noqa diff --git a/models/YOLO-World/yolo_world/engine/optimizers/__init__.py b/models/YOLO-World/yolo_world/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..607cefb27435590334926f1521734b1ecadc32ab --- /dev/null +++ b/models/YOLO-World/yolo_world/engine/optimizers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .yolow_v5_optim_constructor import YOLOWv5OptimizerConstructor + +__all__ = ['YOLOWv5OptimizerConstructor'] diff --git a/models/YOLO-World/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py b/models/YOLO-World/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b625ebc9684c4cac2a27383f592a786a5a9e00 --- /dev/null +++ b/models/YOLO-World/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py @@ -0,0 +1,187 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+import logging +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from torch.nn import GroupNorm, LayerNorm +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.optim import OptimWrapper, DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils import mmcv_full_available +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm + +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOWv5OptimizerConstructor(DefaultOptimWrapperConstructor): + """YOLO World v5 constructor for optimizers.""" + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None) -> None: + super().__init__(optim_wrapper_cfg, paramwise_cfg) + self.base_total_batch_size = self.paramwise_cfg.pop( + 'base_total_batch_size', 64) + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = '', + is_dcn_module: Optional[Union[int, float]] = None) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. 
+ """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if bypass_duplicate and self._is_in(param_group, params): + print_log( + f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}', + logger='current', + level=logging.WARNING) + continue + if not param.requires_grad: + params.append(param_group) + continue + + # if the parameter match one of the custom keys, ignore other rules + for key in sorted_keys: + if key in f'{prefix}.{name}': + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) 
+ param_group['weight_decay'] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + # NOTE: the behavious is different from MMDetection + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not ( + is_norm or is_dcn_module) and bias_lr_mult is not None: + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # bias lr and decay + elif (name == 'bias' and not is_dcn_module + and bias_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif (param.ndim == 1 and not is_dcn_module + and flat_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- {full_name}:{key}={value}', + logger='current') + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + 
prefix=child_prefix, + is_dcn_module=is_dcn_module) + + def __call__(self, model: nn.Module) -> OptimWrapper: + if hasattr(model, 'module'): + model = model.module + + optim_wrapper_cfg = self.optim_wrapper_cfg.copy() + optim_wrapper_cfg.setdefault('type', 'OptimWrapper') + optimizer_cfg = self.optimizer_cfg.copy() + + # follow the original yolov5 implementation + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay = optimizer_cfg.get('weight_decay', 0) + weight_decay *= scale_factor + optimizer_cfg['weight_decay'] = weight_decay + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg['params'] = model.parameters() + optimizer = OPTIMIZERS.build(optimizer_cfg) + else: + # set param-wise lr and weight decay recursively + params: List = [] + self.add_params(params, model) + optimizer_cfg['params'] = params + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/models/YOLO-World/yolo_world/models/__init__.py b/models/YOLO-World/yolo_world/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..98bbeaef134ba04bede5d409537d05c6616cb8f0 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+from .backbones import * # noqa +from .layers import * # noqa +from .detectors import * # noqa +from .losses import * # noqa +from .data_preprocessors import * # noqa +from .dense_heads import * # noqa +from .necks import * # noqa +from .assigner import * # noqa diff --git a/models/YOLO-World/yolo_world/models/assigner/__init__.py b/models/YOLO-World/yolo_world/models/assigner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..320009790289c81a43c57a93a12ce1896963a6b6 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/assigner/__init__.py @@ -0,0 +1,3 @@ +from .task_aligned_assigner import YOLOWorldSegAssigner + +__all__ = ['YOLOWorldSegAssigner'] \ No newline at end of file diff --git a/models/YOLO-World/yolo_world/models/assigner/task_aligned_assigner.py b/models/YOLO-World/yolo_world/models/assigner/task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f0d242844eb703da88fd5c74c215e29f017d35 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/assigner/task_aligned_assigner.py @@ -0,0 +1,108 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import torch +from torch import Tensor +from mmyolo.registry import TASK_UTILS +from mmyolo.models.task_modules.assigners import BatchTaskAlignedAssigner +from mmyolo.models.task_modules.assigners.utils import select_highest_overlaps + +@TASK_UTILS.register_module() +class YOLOWorldSegAssigner(BatchTaskAlignedAssigner): + + def __init__(self, + num_classes: int, + topk: int = 13, + alpha: float = 1, + beta: float = 6, + eps: float = 1e-7, + use_ciou: bool = False): + super().__init__(num_classes, topk, alpha, beta, eps, use_ciou) + + @torch.no_grad() + def forward( + self, + pred_bboxes: Tensor, + pred_scores: Tensor, + priors: Tensor, + gt_labels: Tensor, + gt_bboxes: Tensor, + pad_bbox_flag: Tensor, + ) -> dict: + """Assign gt to bboxes. + + The assignment is done in following steps + 1. 
compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bboxes, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 4) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict) Assigned result: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned boxes, + shape(batch_size, num_priors, 4) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors, num_classes) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + """ + # (num_priors, 4) -> (num_priors, 2) + priors = priors[:, :2] + + batch_size = pred_scores.size(0) + num_gt = gt_bboxes.size(1) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full(pred_scores[..., 0].shape, self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assigned_scores': + gt_bboxes.new_full(pred_scores.shape, 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + if num_gt == 0: + return assigned_result + + pos_mask, alignment_metrics, overlaps = self.get_pos_mask( + pred_bboxes, pred_scores, priors, gt_labels, gt_bboxes, + pad_bbox_flag, batch_size, num_gt) + + (assigned_gt_idxs, fg_mask_pre_prior, + pos_mask) = select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + 
gt_labels, gt_bboxes, assigned_gt_idxs, fg_mask_pre_prior, + batch_size, num_gt) + + # normalize + alignment_metrics *= pos_mask + pos_align_metrics = alignment_metrics.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * pos_mask).max(axis=-1, keepdim=True)[0] + norm_align_metric = ( + alignment_metrics * pos_overlaps / + (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + assigned_scores = assigned_scores * norm_align_metric + + assigned_result['assigned_labels'] = assigned_labels + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + assigned_result['assigned_gt_idxs'] = assigned_gt_idxs + return assigned_result diff --git a/models/YOLO-World/yolo_world/models/backbones/__init__.py b/models/YOLO-World/yolo_world/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67698adfe7d6efe0beef29127ce7f34e9aa573ba --- /dev/null +++ b/models/YOLO-World/yolo_world/models/backbones/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Tencent Inc. All rights reserved. +# YOLO Multi-Modal Backbone (Vision Language) +# Vision: YOLOv8 CSPDarknet +# Language: CLIP Text Encoder (12-layer transformer) +from .mm_backbone import ( + MultiModalYOLOBackbone, + HuggingVisionBackbone, + HuggingCLIPLanguageBackbone, + PseudoLanguageBackbone) + +__all__ = [ + 'MultiModalYOLOBackbone', + 'HuggingVisionBackbone', + 'HuggingCLIPLanguageBackbone', + 'PseudoLanguageBackbone' +] diff --git a/models/YOLO-World/yolo_world/models/backbones/mm_backbone.py b/models/YOLO-World/yolo_world/models/backbones/mm_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..37dab1537080f8fa573c5023be22e0926e730950 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/backbones/mm_backbone.py @@ -0,0 +1,227 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+import itertools +from typing import List, Sequence, Tuple +import torch +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm +from mmengine.model import BaseModule +from mmyolo.registry import MODELS +from mmdet.utils import OptMultiConfig, ConfigType +from transformers import (AutoTokenizer, AutoModel, CLIPTextConfig) +from transformers import CLIPTextModelWithProjection as CLIPTP + + +@MODELS.register_module() +class HuggingVisionBackbone(BaseModule): + + def __init__(self, + model_name: str, + out_indices: Sequence[int] = (0, 1, 2, 3), + norm_eval: bool = True, + frozen_modules: Sequence[str] = (), + init_cfg: OptMultiConfig = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.norm_eval = norm_eval + self.frozen_modules = frozen_modules + self.model = AutoModel.from_pretrained(model_name) + + self._freeze_modules() + + def forward(self, image: Tensor) -> Tuple[Tensor]: + encoded_dict = self.image_model(pixel_values=image, + output_hidden_states=True) + hidden_states = encoded_dict.hidden_states + img_feats = encoded_dict.get('reshaped_hidden_states', hidden_states) + img_feats = [img_feats[i] for i in self.image_out_indices] + return tuple(img_feats) + + def _freeze_modules(self): + for name, module in self.model.named_modules(): + for frozen_name in self.frozen_modules: + if name.startswith(frozen_name): + module.eval() + for param in module.parameters(): + param.requires_grad = False + break + + def train(self, mode=True): + super().train(mode) + self._freeze_modules() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class HuggingCLIPLanguageBackbone(BaseModule): + + def __init__(self, + model_name: str, + frozen_modules: Sequence[str] = (), + dropout: float = 0.0, + training_use_cache: bool = False, + init_cfg: OptMultiConfig = None) -> None: + + super().__init__(init_cfg=init_cfg) + + 
self.frozen_modules = frozen_modules + self.training_use_cache = training_use_cache + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + clip_config = CLIPTextConfig.from_pretrained(model_name, + attention_dropout=dropout) + self.model = CLIPTP.from_pretrained(model_name, config=clip_config) + self._freeze_modules() + + def forward_tokenizer(self, texts): + if not hasattr(self, 'text'): + text = list(itertools.chain(*texts)) + text = self.tokenizer(text=text, return_tensors='pt', padding=True) + self.text = text.to(device=self.model.device) + return self.text + + def forward(self, text: List[List[str]]) -> Tensor: + num_per_batch = [len(t) for t in text] + assert max(num_per_batch) == min(num_per_batch), ( + 'number of sequences not equal in batch') + text = list(itertools.chain(*text)) + text = self.tokenizer(text=text, return_tensors='pt', padding=True) + text = text.to(device=self.model.device) + txt_outputs = self.model(**text) + txt_feats = txt_outputs.text_embeds + txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) + txt_feats = txt_feats.reshape(-1, num_per_batch[0], + txt_feats.shape[-1]) + return txt_feats + + def _freeze_modules(self): + + if len(self.frozen_modules) == 0: + # not freeze + return + if self.frozen_modules[0] == "all": + self.model.eval() + for _, module in self.model.named_modules(): + module.eval() + for param in module.parameters(): + param.requires_grad = False + return + for name, module in self.model.named_modules(): + for frozen_name in self.frozen_modules: + if name.startswith(frozen_name): + module.eval() + for param in module.parameters(): + param.requires_grad = False + break + + def train(self, mode=True): + super().train(mode) + self._freeze_modules() + + +@MODELS.register_module() +class PseudoLanguageBackbone(BaseModule): + """Pseudo Language Backbone + Args: + text_embed_path (str): path to the text embedding file + """ + + def __init__(self, + text_embed_path: str = "", + test_embed_path: str = 
None, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + # {text:embed} + self.text_embed = torch.load(text_embed_path, map_location='cpu') + if test_embed_path is None: + self.test_embed = self.text_embed + else: + self.test_embed = torch.load(test_embed_path) + self.register_buffer("buff", torch.zeros([ + 1, + ])) + + def forward_cache(self, text: List[List[str]]) -> Tensor: + if not hasattr(self, "cache"): + self.cache = self.forward_text(text) + return self.cache + + def forward(self, text: List[List[str]]) -> Tensor: + if self.training: + return self.forward_text(text) + else: + return self.forward_cache(text) + + def forward_text(self, text: List[List[str]]) -> Tensor: + num_per_batch = [len(t) for t in text] + assert max(num_per_batch) == min(num_per_batch), ( + 'number of sequences not equal in batch') + text = list(itertools.chain(*text)) + if self.training: + text_embed_dict = self.text_embed + else: + text_embed_dict = self.test_embed + text_embeds = torch.stack( + [text_embed_dict[x.split("/")[0]] for x in text]) + # requires no grad and force to float + text_embeds = text_embeds.to( + self.buff.device).requires_grad_(False).float() + text_embeds = text_embeds.reshape(-1, num_per_batch[0], + text_embeds.shape[-1]) + return text_embeds + + +@MODELS.register_module() +class MultiModalYOLOBackbone(BaseModule): + + def __init__(self, + image_model: ConfigType, + text_model: ConfigType, + frozen_stages: int = -1, + with_text_model: bool = True, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg) + self.with_text_model = with_text_model + self.image_model = MODELS.build(image_model) + if self.with_text_model: + self.text_model = MODELS.build(text_model) + else: + self.text_model = None + self.frozen_stages = frozen_stages + self._freeze_stages() + + def _freeze_stages(self): + """Freeze the parameters of the specified stage so that they are no + longer updated.""" + if self.frozen_stages >= 0: + for i in 
range(self.frozen_stages + 1): + m = getattr(self.image_model, self.image_model.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super().train(mode) + self._freeze_stages() + + def forward(self, image: Tensor, + text: List[List[str]]) -> Tuple[Tuple[Tensor], Tensor]: + img_feats = self.image_model(image) + if self.with_text_model: + txt_feats = self.text_model(text) + return img_feats, txt_feats + else: + return img_feats, None + + def forward_text(self, text: List[List[str]]) -> Tensor: + assert self.with_text_model, "forward_text() requires a text model" + txt_feats = self.text_model(text) + return txt_feats + + def forward_image(self, image: Tensor) -> Tuple[Tensor]: + return self.image_model(image) diff --git a/models/YOLO-World/yolo_world/models/data_preprocessors/__init__.py b/models/YOLO-World/yolo_world/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3959ac60693349c2ecd1a659aa0ca32c00c7eae --- /dev/null +++ b/models/YOLO-World/yolo_world/models/data_preprocessors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .data_preprocessor import YOLOWDetDataPreprocessor + +__all__ = ['YOLOWDetDataPreprocessor'] diff --git a/models/YOLO-World/yolo_world/models/data_preprocessors/data_preprocessor.py b/models/YOLO-World/yolo_world/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..58787063c8da3cd654c6e33eb81919a106273ab9 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str,
                 None]


@MODELS.register_module()
class YOLOWDetDataPreprocessor(DetDataPreprocessor):
    """Rewrite collate_fn to get faster training speed.

    Note: It must be used together with `mmyolo.datasets.utils.yolow_collate`
    """

    def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs):
        super().__init__(*args, non_blocking=non_blocking, **kwargs)

    def forward(self, data: dict, training: bool = False) -> dict:
        """Perform normalization, padding and bgr2rgb conversion based on
        ``DetDataPreprocessor``.

        Outside training the call is delegated unchanged to the parent class.
        During training ``data['data_samples']`` must already be a dict (it
        is read by the keys ``bboxes_labels`` and ``texts``, plus the
        optional ``masks`` and ``is_detection``).

        Args:
            data (dict): Data sampled from dataloader.
            training (bool): Whether to enable training time augmentation.

        Returns:
            dict: Data in the same format as the model input.

        Raises:
            AssertionError: If ``training`` is True and
                ``data['data_samples']`` is not a dict.
        """
        if not training:
            return super().forward(data, training)

        data = self.cast_data(data)
        inputs, data_samples = data['inputs'], data['data_samples']
        assert isinstance(data['data_samples'], dict)

        # TODO: Supports multi-scale training
        if self._channel_conversion and inputs.shape[1] == 3:
            # Reverse the channel order of 3-channel inputs.
            inputs = inputs[:, [2, 1, 0], ...]
        if self._enable_normalize:
            inputs = (inputs - self.mean) / self.std

        if self.batch_augments is not None:
            for batch_aug in self.batch_augments:
                inputs, data_samples = batch_aug(inputs, data_samples)

        # Build one dict per image. ``[meta] * n`` would make every entry the
        # same object, so a later in-place edit of one image's meta would
        # silently change all of them.
        batch_input_shape = inputs.shape[2:]
        img_metas = [{
            'batch_input_shape': batch_input_shape
        } for _ in range(len(inputs))]
        data_samples_output = {
            'bboxes_labels': data_samples['bboxes_labels'],
            'texts': data_samples['texts'],
            'img_metas': img_metas
        }
        if 'masks' in data_samples:
            data_samples_output['masks'] = data_samples['masks']
        if 'is_detection' in data_samples:
            data_samples_output['is_detection'] = data_samples['is_detection']

        return {'inputs': inputs, 'data_samples': data_samples_output}
def _region_text_similarity(feats: Tensor, embeds: Tensor,
                            use_einsum: bool) -> Tensor:
    """Dot-product similarity between every spatial location and every text
    embedding.

    Args:
        feats (Tensor): Image features laid out as (b, c, h, w).
        embeds (Tensor): Text embeddings laid out as (b, k, c).
        use_einsum (bool): Use a single ``torch.einsum`` call; otherwise the
            same product is spelled out with permute / reshape / matmul.

    Returns:
        Tensor: Similarity map laid out as (b, k, h, w).
    """
    if use_einsum:
        return torch.einsum('bchw,bkc->bkhw', feats, embeds)

    batch, channel, height, width = feats.shape
    _, num_embeds, _ = embeds.shape
    # bchw -> bhwc -> b(hw)c
    flat_feats = feats.permute(0, 2, 3, 1).reshape(batch, -1, channel)
    # bkc -> bck, then b(hw)c @ bck -> b(hw)k
    scores = torch.matmul(flat_feats, embeds.permute(0, 2, 1))
    # b(hw)k -> bhwk -> bkhw
    return scores.reshape(batch, height, width,
                          num_embeds).permute(0, 3, 1, 2)


@MODELS.register_module()
class ContrastiveHead(BaseModule):
    """Region-text contrastive head for YOLO-World.

    Both inputs are l2-normalized, multiplied together, and the result is
    scaled by ``exp(logit_scale)`` and shifted by ``bias`` (both learnable
    scalars).

    Args:
        embed_dims (int): embed dim of text and image features
        init_cfg (OptConfigType): Initialization config. Defaults to None.
        use_einsum (bool): Compute the similarity with ``torch.einsum``
            instead of explicit matmul. Defaults to True.
    """

    def __init__(self,
                 embed_dims: int,
                 init_cfg: OptConfigType = None,
                 use_einsum: bool = True) -> None:

        super().__init__(init_cfg=init_cfg)

        self.bias = nn.Parameter(torch.zeros([]))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.use_einsum = use_einsum

    def forward(self, x: Tensor, w: Tensor) -> Tensor:
        """Score image features ``x`` against text embeddings ``w``."""
        img = F.normalize(x, dim=1, p=2)
        txt = F.normalize(w, dim=-1, p=2)
        sim = _region_text_similarity(img, txt, self.use_einsum)
        return sim * self.logit_scale.exp() + self.bias


@MODELS.register_module()
class BNContrastiveHead(BaseModule):
    """Contrastive head that normalizes image features with a norm layer.

    Same scoring as ``ContrastiveHead`` except that the image features go
    through the layer built from ``norm_cfg`` instead of l2-normalization;
    the text embeddings are still l2-normalized.

    Args:
        embed_dims (int): embed dim of text and image features
        norm_cfg (dict): normalization params
        init_cfg (OptConfigType): Initialization config. Defaults to None.
        use_einsum (bool): Compute the similarity with ``torch.einsum``
            instead of explicit matmul. Defaults to True.
    """

    def __init__(self,
                 embed_dims: int,
                 norm_cfg: ConfigDict,
                 init_cfg: OptConfigType = None,
                 use_einsum: bool = True) -> None:

        super().__init__(init_cfg=init_cfg)
        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
        self.bias = nn.Parameter(torch.zeros([]))
        # use -1.0 is more stable
        self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))
        self.use_einsum = use_einsum

    def forward(self, x: Tensor, w: Tensor) -> Tensor:
        """Score image features ``x`` against text embeddings ``w``."""
        img = self.norm(x)
        txt = F.normalize(w, dim=-1, p=2)
        sim = _region_text_similarity(img, txt, self.use_einsum)
        return sim * self.logit_scale.exp() + self.bias


@MODELS.register_module()
class RepBNContrastiveHead(BaseModule):
    """Text-free variant of the norm-layer contrastive head.

    The image features pass through the norm layer and then a 1x1
    convolution with ``num_guide_embeds`` output channels; no text
    embeddings are taken at forward time.

    Args:
        embed_dims (int): embed dim of text and image features
        num_guide_embeds (int): output channels of the 1x1 convolution
        norm_cfg (dict): normalization params
        init_cfg (OptConfigType): Initialization config. Defaults to None.
    """

    def __init__(self,
                 embed_dims: int,
                 num_guide_embeds: int,
                 norm_cfg: ConfigDict,
                 init_cfg: OptConfigType = None) -> None:

        super().__init__(init_cfg=init_cfg)
        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
        self.conv = nn.Conv2d(embed_dims, num_guide_embeds, kernel_size=1)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the norm layer followed by the 1x1 convolution."""
        return self.conv(self.norm(x))
@MODELS.register_module()
class YOLOWorldHeadModule(YOLOv8HeadModule):
    """Head Module for YOLO-World

    Args:
        embed_dims (int): embed dim for text features and image features
        use_bn_head (bool): use batch normalization head
        use_einsum (bool): forwarded to the contrastive heads
        freeze_all (bool): freeze every parameter of this module and keep
            its batch-norm layers in eval mode
    """

    def __init__(self,
                 *args,
                 embed_dims: int,
                 use_bn_head: bool = False,
                 use_einsum: bool = True,
                 freeze_all: bool = False,
                 **kwargs) -> None:
        # These are read by ``_init_layers``, so they are assigned before the
        # parent constructor runs.
        self.embed_dims = embed_dims
        self.use_bn_head = use_bn_head
        self.use_einsum = use_einsum
        self.freeze_all = freeze_all
        super().__init__(*args, **kwargs)

    def init_weights(self, prior_prob=0.01):
        """Initialize the weight and bias of the YOLO-World head.

        ``prior_prob`` is accepted for signature compatibility and is not
        used here.
        """
        super().init_weights()
        for cls_pred, cls_contrast, stride in zip(self.cls_preds,
                                                  self.cls_contrasts,
                                                  self.featmap_strides):
            cls_pred[-1].bias.data[:] = 0.0  # reset bias
            # Heads without their own ``bias`` attribute are left untouched.
            if hasattr(cls_contrast, 'bias'):
                nn.init.constant_(
                    cls_contrast.bias.data,
                    math.log(5 / self.num_classes / (640 / stride)**2))

    def _init_layers(self) -> None:
        """initialize conv layers in YOLOv8 head."""
        # Init decouple head
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.cls_contrasts = nn.ModuleList()

        reg_out_channels = max(
            (16, self.in_channels[0] // 4, self.reg_max * 4))
        cls_out_channels = max(self.in_channels[0], self.num_classes)

        for i in range(self.num_levels):
            self.reg_preds.append(
                nn.Sequential(
                    ConvModule(in_channels=self.in_channels[i],
                               out_channels=reg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    ConvModule(in_channels=reg_out_channels,
                               out_channels=reg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    nn.Conv2d(in_channels=reg_out_channels,
                              out_channels=4 * self.reg_max,
                              kernel_size=1)))
            self.cls_preds.append(
                nn.Sequential(
                    ConvModule(in_channels=self.in_channels[i],
                               out_channels=cls_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    ConvModule(in_channels=cls_out_channels,
                               out_channels=cls_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    nn.Conv2d(in_channels=cls_out_channels,
                              out_channels=self.embed_dims,
                              kernel_size=1)))
            if self.use_bn_head:
                self.cls_contrasts.append(
                    BNContrastiveHead(self.embed_dims,
                                      self.norm_cfg,
                                      use_einsum=self.use_einsum))
            else:
                self.cls_contrasts.append(
                    ContrastiveHead(self.embed_dims,
                                    use_einsum=self.use_einsum))

        proj = torch.arange(self.reg_max, dtype=torch.float)
        self.register_buffer('proj', proj, persistent=False)

        if self.freeze_all:
            self._freeze_all()

    def _freeze_all(self):
        """Freeze the model."""
        for m in self.modules():
            if isinstance(m, _BatchNorm):
                m.eval()
            for param in m.parameters():
                param.requires_grad = False

    def train(self, mode=True):
        """Switch mode, re-applying the freeze when ``freeze_all`` is set.

        Returns:
            YOLOWorldHeadModule: ``self``, as ``torch.nn.Module.train`` does,
            so that ``eval()`` and chained calls return the module rather
            than ``None``.
        """
        super().train(mode)
        if self.freeze_all:
            self._freeze_all()
        return self

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        assert len(img_feats) == self.num_levels
        # Every level is scored against the same text features.
        txt_feats = [txt_feats for _ in range(self.num_levels)]
        return multi_apply(self.forward_single, img_feats, txt_feats,
                           self.cls_preds, self.reg_preds, self.cls_contrasts)

    def forward_single(self, img_feat: Tensor, txt_feat: Tensor,
                       cls_pred: nn.Module, reg_pred: nn.Module,
                       cls_contrast: nn.Module) -> Tuple:
        """Forward feature of a single scale level.

        Returns:
            tuple: ``(cls_logit, bbox_preds, bbox_dist_preds)`` in training
            mode, ``(cls_logit, bbox_preds)`` otherwise.
        """
        b, _, h, w = img_feat.shape
        cls_embed = cls_pred(img_feat)
        cls_logit = cls_contrast(cls_embed, txt_feat)
        bbox_dist_preds = reg_pred(img_feat)
        if self.reg_max > 1:
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)

            # TODO: The get_flops script cannot handle the situation of
            #  matmul, and needs to be fixed later
            # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds
        else:
            return cls_logit, bbox_preds


@MODELS.register_module()
class RepYOLOWorldHeadModule(YOLOWorldHeadModule):
    """YOLO-World head module whose classification needs no text input.

    The contrastive heads built by the parent class are replaced with
    ``RepBNContrastiveHead`` instances, so ``forward`` takes image features
    only.

    Args:
        embed_dims (int): embed dim of the classification embeddings
        num_guide (int): passed to ``RepBNContrastiveHead`` as
            ``num_guide_embeds``
        freeze_all (bool): see ``YOLOWorldHeadModule``
    """

    def __init__(self,
                 *args,
                 embed_dims: int,
                 num_guide: int,
                 freeze_all: bool = False,
                 **kwargs) -> None:
        super().__init__(*args,
                         embed_dims=embed_dims,
                         use_bn_head=True,
                         use_einsum=False,
                         freeze_all=freeze_all,
                         **kwargs)

        # using rep head
        cls_contrasts = []
        for _ in range(self.num_levels):
            cls_contrasts.append(
                RepBNContrastiveHead(
                    embed_dims=embed_dims,
                    num_guide_embeds=num_guide,
                    norm_cfg=self.norm_cfg
                )
            )
        self.cls_contrasts = nn.ModuleList(cls_contrasts)

    def forward_single(self, img_feat: Tensor, cls_pred: nn.Module,
                       reg_pred: nn.Module,
                       cls_contrast: nn.Module) -> Tuple:
        """Forward feature of a single scale level (no text features).

        Returns:
            tuple: ``(cls_logit, bbox_preds, bbox_dist_preds)`` in training
            mode, ``(cls_logit, bbox_preds)`` otherwise.
        """
        b, _, h, w = img_feat.shape
        cls_embed = cls_pred(img_feat)
        cls_logit = cls_contrast(cls_embed)
        bbox_dist_preds = reg_pred(img_feat)
        if self.reg_max > 1:
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)

            # TODO: The get_flops script cannot handle the situation of
            #  matmul, and needs to be fixed later
            # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds
        else:
            return cls_logit, bbox_preds

    def forward(self, img_feats: Tuple[Tensor]) -> Tuple[List]:
        """Forward image features from the upstream network."""
        assert len(img_feats) == self.num_levels
        return multi_apply(self.forward_single, img_feats, self.cls_preds,
                           self.reg_preds, self.cls_contrasts)


@MODELS.register_module()
class YOLOWorldHead(YOLOv8Head):
    """YOLO-World Head (YOLO World v8 head).

    Args:
        world_size (int): Factor applied to every returned loss. When it is
            -1 the value reported by ``get_dist_info`` is used instead.
            Defaults to -1.
    """

    def __init__(self, world_size=-1, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.world_size = world_size

    def loss(self, img_feats: Tuple[Tensor], txt_feats: Tensor,
             batch_data_samples: Union[list, dict]) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the features of the upstream network.

        ``batch_data_samples`` is indexed by the keys ``bboxes_labels`` and
        ``img_metas``.
        """

        outs = self(img_feats, txt_feats)
        # Fast version
        loss_inputs = outs + (batch_data_samples['bboxes_labels'],
                              batch_data_samples['img_metas'])
        losses = self.loss_by_feat(*loss_inputs)

        return losses

    def loss_and_predict(
        self,
        img_feats: Tuple[Tensor],
        txt_feats: Tensor,
        batch_data_samples: SampleList,
        proposal_cfg: Optional[ConfigDict] = None
    ) -> Tuple[dict, InstanceList]:
        """Perform forward propagation of the head, then calculate loss and
        predictions from the features and data samples.
        """
        outputs = unpack_gt_instances(batch_data_samples)
        (batch_gt_instances, batch_gt_instances_ignore,
         batch_img_metas) = outputs

        outs = self(img_feats, txt_feats)

        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
                              batch_gt_instances_ignore)
        losses = self.loss_by_feat(*loss_inputs)

        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           cfg=proposal_cfg)
        return losses, predictions

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        return self.head_module(img_feats, txt_feats)

    def predict(self,
                img_feats: Tuple[Tensor],
                txt_feats: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = False) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the features of the upstream network.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]
        outs = self(img_feats, txt_feats)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           rescale=rescale)
        return predictions

    def aug_test(self,
                 aug_batch_feats,
                 aug_batch_img_metas,
                 rescale=False,
                 with_ori_nms=False,
                 **kwargs):
        """Test function with test time augmentation."""
        raise NotImplementedError('aug_test is not implemented yet.')

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            bbox_dist_preds: Sequence[Tensor],
            batch_gt_instances: Sequence[InstanceData],
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            bbox_dist_preds (Sequence[Tensor]): Box distribution logits for
                each scale level with shape (bs, reg_max + 1, H*W, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """
        num_imgs = len(batch_img_metas)

        current_featmap_sizes = [
            cls_score.shape[2:] for cls_score in cls_scores
        ]
        # If the shape does not equal, generate new one
        if current_featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = current_featmap_sizes

            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                self.featmap_sizes_train,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device,
                with_stride=True)

            self.num_level_priors = [len(n) for n in mlvl_priors_with_stride]
            self.flatten_priors_train = torch.cat(mlvl_priors_with_stride,
                                                  dim=0)
            self.stride_tensor = self.flatten_priors_train[..., [2]]

        # gt info
        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        # pred info
        flatten_cls_preds = [
            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                 self.num_classes)
            for cls_pred in cls_scores
        ]
        flatten_pred_bboxes = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]
        # (bs, n, 4 * reg_max)
        flatten_pred_dists = [
            bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4)
            for bbox_pred_org in bbox_dist_preds
        ]

        flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1)
        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
        flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1)
        flatten_pred_bboxes = self.bbox_coder.decode(
            self.flatten_priors_train[..., :2], flatten_pred_bboxes,
            self.stride_tensor[..., 0])

        assigned_result = self.assigner(
            (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype),
            flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train,
            gt_labels, gt_bboxes, pad_bbox_flag)

        assigned_bboxes = assigned_result['assigned_bboxes']
        assigned_scores = assigned_result['assigned_scores']
        fg_mask_pre_prior = assigned_result['fg_mask_pre_prior']

        assigned_scores_sum = assigned_scores.sum().clamp(min=1)

        loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum()
        loss_cls /= assigned_scores_sum

        # rescale bbox
        assigned_bboxes /= self.stride_tensor
        flatten_pred_bboxes /= self.stride_tensor

        # select positive samples mask
        num_pos = fg_mask_pre_prior.sum()
        if num_pos > 0:
            # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox
            # will not report an error
            # iou loss
            prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4])
            pred_bboxes_pos = torch.masked_select(
                flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(
                assigned_bboxes, prior_bbox_mask).reshape([-1, 4])
            bbox_weight = torch.masked_select(assigned_scores.sum(-1),
                                              fg_mask_pre_prior).unsqueeze(-1)
            loss_bbox = self.loss_bbox(
                pred_bboxes_pos, assigned_bboxes_pos,
                weight=bbox_weight) / assigned_scores_sum

            # dfl loss
            pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior]
            assigned_ltrb = self.bbox_coder.encode(
                self.flatten_priors_train[..., :2] / self.stride_tensor,
                assigned_bboxes,
                max_dis=self.head_module.reg_max - 1,
                eps=0.01)
            assigned_ltrb_pos = torch.masked_select(
                assigned_ltrb, prior_bbox_mask).reshape([-1, 4])
            loss_dfl = self.loss_dfl(pred_dist_pos.reshape(
                -1, self.head_module.reg_max),
                                     assigned_ltrb_pos.reshape(-1),
                                     weight=bbox_weight.expand(-1,
                                                               4).reshape(-1),
                                     avg_factor=assigned_scores_sum)
        else:
            # Zero-valued losses that still depend on the predictions.
            loss_bbox = flatten_pred_bboxes.sum() * 0
            loss_dfl = flatten_pred_bboxes.sum() * 0
        if self.world_size == -1:
            _, world_size = get_dist_info()
        else:
            world_size = self.world_size
        return dict(loss_cls=loss_cls * num_imgs * world_size,
                    loss_bbox=loss_bbox * num_imgs * world_size,
                    loss_dfl=loss_dfl * num_imgs * world_size)

    def predict_by_feat(self,
                        cls_scores: List[Tensor],
                        bbox_preds: List[Tensor],
                        objectnesses: Optional[List[Tensor]] = None,
                        batch_img_metas: Optional[List[dict]] = None,
                        cfg: Optional[ConfigDict] = None,
                        rescale: bool = True,
                        with_nms: bool = True) -> List[InstanceData]:
        """Transform a batch of output features extracted by the head into
        bbox results.
        Args:
            cls_scores (list[Tensor]): Classification scores for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * num_classes, H, W).
            bbox_preds (list[Tensor]): Box energies / deltas for all
                scale levels, each is a 4D-tensor, has shape
                (batch_size, num_priors * 4, H, W).
            objectnesses (list[Tensor], Optional): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).
            batch_img_metas (list[dict], Optional): Batch image meta info.
                Although it defaults to None, its length is taken
                unconditionally, so a list must be supplied.
            cfg (ConfigDict, optional): Test / postprocessing
                configuration, if None, test_cfg would be used.
                Defaults to None.
            rescale (bool): If True, return boxes in original image space.
                Defaults to True.
            with_nms (bool): If True, do nms before return boxes.
                Defaults to True.

        Returns:
            list[:obj:`InstanceData`]: Object detection results of each image
            after the post process. Each item usually contains following keys.

            - scores (Tensor): Classification scores, has a shape
              (num_instance, )
            - labels (Tensor): Labels of bboxes, has a shape
              (num_instances, ).
            - bboxes (Tensor): Has a shape (num_instances, 4),
              the last dimension 4 arrange as (x1, y1, x2, y2).
        """
        assert len(cls_scores) == len(bbox_preds)
        if objectnesses is None:
            with_objectnesses = False
        else:
            with_objectnesses = True
            assert len(cls_scores) == len(objectnesses)

        cfg = self.test_cfg if cfg is None else cfg
        # Work on a copy: ``multi_label`` and ``max_per_img`` are overwritten
        # below.
        cfg = copy.deepcopy(cfg)

        multi_label = cfg.multi_label
        multi_label &= self.num_classes > 1
        cfg.multi_label = multi_label

        num_imgs = len(batch_img_metas)
        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]

        # If the shape does not change, use the previous mlvl_priors
        if featmap_sizes != self.featmap_sizes:
            self.mlvl_priors = self.prior_generator.grid_priors(
                featmap_sizes,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device)
            self.featmap_sizes = featmap_sizes
        flatten_priors = torch.cat(self.mlvl_priors)

        mlvl_strides = [
            flatten_priors.new_full(
                (featmap_size.numel() * self.num_base_priors, ), stride) for
            featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
        ]
        flatten_stride = torch.cat(mlvl_strides)

        # flatten cls_scores, bbox_preds and objectness
        flatten_cls_scores = [
            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                  self.num_classes)
            for cls_score in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]

        flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
        flatten_decoded_bboxes = self.bbox_coder.decode(
            flatten_priors[None], flatten_bbox_preds, flatten_stride)

        if with_objectnesses:
            flatten_objectness = [
                objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
                for objectness in objectnesses
            ]
            flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
        else:
            # One placeholder per image so the ``zip`` below still lines up.
            flatten_objectness = [None for _ in range(num_imgs)]

        results_list = []
        for (bboxes, scores, objectness,
             img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
                              flatten_objectness, batch_img_metas):
            ori_shape = img_meta['ori_shape']
            scale_factor = img_meta['scale_factor']
            if 'pad_param' in img_meta:
                pad_param = img_meta['pad_param']
            else:
                pad_param = None

            score_thr = cfg.get('score_thr', -1)
            # yolox_style does not require the following operations
            if objectness is not None and score_thr > 0 and not cfg.get(
                    'yolox_style', False):
                conf_inds = objectness > score_thr
                bboxes = bboxes[conf_inds, :]
                scores = scores[conf_inds, :]
                objectness = objectness[conf_inds]

            if objectness is not None:
                # conf = obj_conf * cls_conf
                scores *= objectness[:, None]

            if scores.shape[0] == 0:
                empty_results = InstanceData()
                empty_results.bboxes = bboxes
                empty_results.scores = scores[:, 0]
                empty_results.labels = scores[:, 0].int()
                results_list.append(empty_results)
                continue

            nms_pre = cfg.get('nms_pre', 100000)
            if cfg.multi_label is False:
                scores, labels = scores.max(1, keepdim=True)
                scores, _, keep_idxs, results = filter_scores_and_topk(
                    scores,
                    score_thr,
                    nms_pre,
                    results=dict(labels=labels[:, 0]))
                labels = results['labels']
            else:
                scores, labels, keep_idxs, _ = filter_scores_and_topk(
                    scores, score_thr, nms_pre)

            results = InstanceData(scores=scores,
                                   labels=labels,
                                   bboxes=bboxes[keep_idxs])

            if rescale:
                if pad_param is not None:
                    results.bboxes -= results.bboxes.new_tensor([
                        pad_param[2], pad_param[0], pad_param[2], pad_param[0]
                    ])
                results.bboxes /= results.bboxes.new_tensor(
                    scale_factor).repeat((1, 2))

            if cfg.get('yolox_style', False):
                # do not need max_per_img
                cfg.max_per_img = len(results)

            # Rescaling was already handled above, hence ``rescale=False``.
            results = self._bbox_post_process(results=results,
                                              cfg=cfg,
                                              rescale=False,
                                              with_nms=with_nms,
                                              img_meta=img_meta)
            results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
            results.bboxes[:, 1::2].clamp_(0, ori_shape[0])

            results_list.append(results)
        return results_list
@MODELS.register_module()
class YOLOWorldSegHeadModule(YOLOv8HeadModule):
    """Head module for YOLO-World with mask-coefficient and prototype
    branches.

    Args:
        embed_dims (int): Output channels of the classification embedding
            convolution.
        proto_channels (int): Passed to ``ProtoModule`` as
            ``middle_channels``.
        mask_channels (int): Output channels of the mask-coefficient
            convolution; also passed to ``ProtoModule``.
        freeze_bbox (bool): Freeze the classification, regression and
            contrastive branches. Defaults to False.
        freeze_all (bool): Additionally freeze the prototype and
            mask-coefficient branches. Defaults to False.
        use_bn_head (bool): Use ``BNContrastiveHead`` instead of
            ``ContrastiveHead``. Defaults to False.
    """

    def __init__(self,
                 *args,
                 embed_dims: int,
                 proto_channels: int,
                 mask_channels: int,
                 freeze_bbox: bool = False,
                 freeze_all: bool = False,
                 use_bn_head: bool = False,
                 **kwargs) -> None:
        # These are read by ``_init_layers``, so they are assigned before the
        # parent constructor runs.
        self.embed_dims = embed_dims
        self.proto_channels = proto_channels
        self.mask_channels = mask_channels
        self.freeze_bbox = freeze_bbox
        self.freeze_all = freeze_all
        self.use_bn_head = use_bn_head
        super().__init__(*args, **kwargs)

    def init_weights(self, prior_prob=0.01):
        """Initialize the weight and bias of the YOLO-World head.

        ``prior_prob`` is accepted for signature compatibility and is not
        used here.
        """
        super().init_weights()
        for cls_pred, cls_contrast, stride in zip(self.cls_preds,
                                                  self.cls_contrasts,
                                                  self.featmap_strides):
            cls_pred[-1].bias.data[:] = 0.0  # reset bias
            if hasattr(cls_contrast, 'bias'):
                nn.init.constant_(
                    cls_contrast.bias.data,
                    math.log(5 / self.num_classes / (640 / stride)**2))

    def _init_layers(self) -> None:
        """initialize conv layers in YOLOv8 head."""
        # Init decouple head
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.seg_preds = nn.ModuleList()
        self.cls_contrasts = nn.ModuleList()

        reg_out_channels = max(
            (16, self.in_channels[0] // 4, self.reg_max * 4))
        seg_out_channels = max(self.in_channels[0] // 4, self.mask_channels)
        cls_out_channels = max(self.in_channels[0], self.num_classes)

        # NOTE(review): ``bbox_norm_cfg`` is the same dict object as
        # ``self.norm_cfg``, so the ``requires_grad`` written here is also
        # seen by the seg / proto branches built from ``self.norm_cfg``.
        # Confirm whether a copy was intended before changing it.
        bbox_norm_cfg = self.norm_cfg
        bbox_norm_cfg['requires_grad'] = not self.freeze_bbox
        if self.freeze_all:
            self.norm_cfg['requires_grad'] = False
            bbox_norm_cfg['requires_grad'] = False

        for i in range(self.num_levels):
            self.reg_preds.append(
                nn.Sequential(
                    ConvModule(in_channels=self.in_channels[i],
                               out_channels=reg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=bbox_norm_cfg,
                               act_cfg=self.act_cfg),
                    ConvModule(in_channels=reg_out_channels,
                               out_channels=reg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=bbox_norm_cfg,
                               act_cfg=self.act_cfg),
                    nn.Conv2d(in_channels=reg_out_channels,
                              out_channels=4 * self.reg_max,
                              kernel_size=1)))
            self.cls_preds.append(
                nn.Sequential(
                    ConvModule(in_channels=self.in_channels[i],
                               out_channels=cls_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=bbox_norm_cfg,
                               act_cfg=self.act_cfg),
                    ConvModule(in_channels=cls_out_channels,
                               out_channels=cls_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=bbox_norm_cfg,
                               act_cfg=self.act_cfg),
                    nn.Conv2d(in_channels=cls_out_channels,
                              out_channels=self.embed_dims,
                              kernel_size=1)))
            self.seg_preds.append(
                nn.Sequential(
                    ConvModule(in_channels=self.in_channels[i],
                               out_channels=seg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    ConvModule(in_channels=seg_out_channels,
                               out_channels=seg_out_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               norm_cfg=self.norm_cfg,
                               act_cfg=self.act_cfg),
                    nn.Conv2d(in_channels=seg_out_channels,
                              out_channels=self.mask_channels,
                              kernel_size=1)))

            if self.use_bn_head:
                self.cls_contrasts.append(
                    BNContrastiveHead(self.embed_dims, self.norm_cfg))
            else:
                self.cls_contrasts.append(ContrastiveHead(self.embed_dims))

        proj = torch.arange(self.reg_max, dtype=torch.float)
        self.register_buffer('proj', proj, persistent=False)

        self.proto_pred = ProtoModule(in_channels=self.in_channels[0],
                                      middle_channels=self.proto_channels,
                                      mask_channels=self.mask_channels,
                                      norm_cfg=self.norm_cfg,
                                      act_cfg=self.act_cfg)
        # Either flag requests freezing (same condition as ``train``). The
        # original tested ``freeze_bbox`` twice, so ``freeze_all=True`` alone
        # left the module unfrozen until the first ``train()`` call.
        if self.freeze_bbox or self.freeze_all:
            self._freeze_all()

    def _freeze_all(self):
        """Freeze the bbox branches, plus the mask branches when
        ``freeze_all`` is set."""
        frozen_list = [self.cls_preds, self.reg_preds, self.cls_contrasts]
        if self.freeze_all:
            frozen_list.extend([self.proto_pred, self.seg_preds])
        for module in frozen_list:
            for m in module.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode: bool = True):
        """Convert the model into training mode while keeping the frozen
        branches frozen.

        Returns:
            YOLOWorldSegHeadModule: ``self``, as ``torch.nn.Module.train``
            does, so that ``eval()`` and chained calls return the module
            rather than ``None``.
        """
        super().train(mode)
        if self.freeze_bbox or self.freeze_all:
            self._freeze_all()
        return self

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network.

        Returns:
            tuple: ``(cls_logit, bbox_preds, bbox_dist_preds, coeff_preds,
            mask_protos)``; the third item is ``None`` outside training.
        """
        assert len(img_feats) == self.num_levels
        txt_feats = [txt_feats for _ in range(self.num_levels)]
        # Prototypes are predicted from the first feature level only.
        mask_protos = self.proto_pred(img_feats[0])
        cls_logit, bbox_preds, bbox_dist_preds, coeff_preds = multi_apply(
            self.forward_single, img_feats, txt_feats, self.cls_preds,
            self.reg_preds, self.cls_contrasts, self.seg_preds)
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds, coeff_preds, mask_protos
        else:
            return cls_logit, bbox_preds, None, coeff_preds, mask_protos

    def forward_single(self, img_feat: Tensor, txt_feat: Tensor,
                       cls_pred: nn.Module, reg_pred: nn.Module,
                       cls_contrast: nn.Module,
                       seg_pred: nn.Module) -> Tuple:
        """Forward feature of a single scale level.

        Returns:
            tuple: ``(cls_logit, bbox_preds, bbox_dist_preds, coeff_pred)``;
            the third item is ``None`` outside training.
        """
        b, _, h, w = img_feat.shape
        cls_embed = cls_pred(img_feat)
        cls_logit = cls_contrast(cls_embed, txt_feat)
        bbox_dist_preds = reg_pred(img_feat)
        coeff_pred = seg_pred(img_feat)
        if self.reg_max > 1:
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)

            # TODO: The get_flops script cannot handle the situation of
            #  matmul, and needs to be fixed later
            # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds, coeff_pred
        else:
            return cls_logit, bbox_preds, None, coeff_pred
@MODELS.register_module()
class YOLOWorldSegHead(YOLOv5InsHead):
    """YOLO World head.

    Instance-segmentation head that feeds image and text features to its
    head module and computes classification, box, DFL and mask losses.
    """

    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.MlvlPointGenerator',
                     offset=0.5,
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
                 loss_cls: ConfigType = dict(type='mmdet.CrossEntropyLoss',
                                             use_sigmoid=True,
                                             reduction='none',
                                             loss_weight=0.5),
                 loss_bbox: ConfigType = dict(type='IoULoss',
                                              iou_mode='ciou',
                                              bbox_format='xyxy',
                                              reduction='sum',
                                              loss_weight=7.5,
                                              return_iou=False),
                 loss_dfl=dict(type='mmdet.DistributionFocalLoss',
                               reduction='mean',
                               loss_weight=1.5 / 4),
                 mask_overlap: bool = True,
                 loss_mask: ConfigType = dict(type='mmdet.CrossEntropyLoss',
                                              use_sigmoid=True,
                                              reduction='none'),
                 loss_mask_weight=0.05,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        super().__init__(head_module=head_module,
                         prior_generator=prior_generator,
                         bbox_coder=bbox_coder,
                         loss_cls=loss_cls,
                         loss_bbox=loss_bbox,
                         train_cfg=train_cfg,
                         test_cfg=test_cfg,
                         init_cfg=init_cfg)
        self.loss_dfl = MODELS.build(loss_dfl)
        # This head has no objectness loss.
        self.loss_obj = None
        self.mask_overlap = mask_overlap
        self.loss_mask: nn.Module = MODELS.build(loss_mask)
        self.loss_mask_weight = loss_mask_weight

    def special_init(self):
        """Since YOLO series algorithms will inherit from YOLOv5Head, but
        different algorithms have special initialization process.

        The special_init function is designed to deal with this situation.
        """
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
            # Add common attributes to reduce calculation
            self.featmap_sizes_train = None
            self.num_level_priors = None
            self.flatten_priors_train = None
            self.stride_tensor = None

    def loss(self, img_feats: Tuple[Tensor], txt_feats: Tensor,
             batch_data_samples: Union[list, dict]) -> dict:
        """Perform forward propagation and loss calculation of the detection
        head on the features of the upstream network.

        ``batch_data_samples`` is indexed with the keys ``'bboxes_labels'``,
        ``'masks'`` and ``'img_metas'``, i.e. the dict ("fast") format.
        """

        outs = self(img_feats, txt_feats)
        # Fast version
        loss_inputs = outs + (batch_data_samples['bboxes_labels'],
                              batch_data_samples['masks'],
                              batch_data_samples['img_metas'])
        losses = self.loss_by_feat(*loss_inputs)

        return losses

    def loss_and_predict(
        self,
        img_feats: Tuple[Tensor],
        txt_feats: Tensor,
        batch_data_samples: SampleList,
        proposal_cfg: Optional[ConfigDict] = None
    ) -> Tuple[dict, InstanceList]:
        """Perform forward propagation of the head, then calculate loss and
        predictions from the features and data samples.
        """
        outputs = unpack_gt_instances(batch_data_samples)
        (batch_gt_instances, batch_gt_instances_ignore,
         batch_img_metas) = outputs

        outs = self(img_feats, txt_feats)

        # NOTE(review): `loss_by_feat` expects
        # (..., batch_gt_instances, batch_gt_masks, batch_img_metas, ...),
        # so the positional arguments below do not line up with its
        # signature (no masks are passed). Confirm how masks should be
        # obtained here before relying on this method.
        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
                              batch_gt_instances_ignore)
        losses = self.loss_by_feat(*loss_inputs)

        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           cfg=proposal_cfg)
        return losses, predictions

    def forward(self, img_feats: Tuple[Tensor],
                txt_feats: Tensor) -> Tuple[List]:
        """Forward features from the upstream network."""
        return self.head_module(img_feats, txt_feats)

    def predict(self,
                img_feats: Tuple[Tensor],
                txt_feats: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = False) -> InstanceList:
        """Perform forward propagation of the detection head and predict
        detection results on the features of the upstream network.
        """
        batch_img_metas = [
            data_samples.metainfo for data_samples in batch_data_samples
        ]
        outs = self(img_feats, txt_feats)
        predictions = self.predict_by_feat(*outs,
                                           batch_img_metas=batch_img_metas,
                                           rescale=rescale)
        return predictions

    def aug_test(self,
                 aug_batch_feats,
                 aug_batch_img_metas,
                 rescale=False,
                 with_ori_nms=False,
                 **kwargs):
        """Test function with test time augmentation."""
        raise NotImplementedError('aug_test is not implemented yet.')

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            bbox_dist_preds: Sequence[Tensor],
            coeff_preds: Sequence[Tensor],
            proto_preds: Tensor,
            batch_gt_instances: Sequence[InstanceData],
            batch_gt_masks: Sequence[Tensor],
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            bbox_dist_preds (Sequence[Tensor]): Box distribution logits for
                each scale level with shape (bs, reg_max + 1, H*W, 4).
            coeff_preds (Sequence[Tensor]): Mask coefficients for each scale
                level, each a 4D-tensor with ``mask_channels`` channels.
            proto_preds (Tensor): Mask prototypes, a 4D-tensor
                (bs, c, mask_h, mask_w).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_gt_masks (Tensor): Ground-truth masks, indexed along the
                first dimension by a batch-wide GT index; resized to the
                prototype resolution when the sizes differ.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """
        num_imgs = len(batch_img_metas)

        current_featmap_sizes = [
            cls_score.shape[2:] for cls_score in cls_scores
        ]
        # If the shape does not equal, generate new one
        if current_featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = current_featmap_sizes

            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                self.featmap_sizes_train,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device,
                with_stride=True)

            self.num_level_priors = [len(n) for n in mlvl_priors_with_stride]
            self.flatten_priors_train = torch.cat(mlvl_priors_with_stride,
                                                  dim=0)
            self.stride_tensor = self.flatten_priors_train[..., [2]]

        # gt info
        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        # Rows whose box coordinates sum to zero are treated as padding.
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        # pred info
        flatten_cls_preds = [
            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                 self.num_classes)
            for cls_pred in cls_scores
        ]
        flatten_pred_bboxes = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]
        # (bs, n, 4 * reg_max)
        flatten_pred_dists = [
            bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4)
            for bbox_pred_org in bbox_dist_preds
        ]

        flatten_pred_coeffs = [
            coeff_pred.permute(0, 2, 3,
                               1).reshape(num_imgs, -1,
                                          self.head_module.mask_channels)
            for coeff_pred in coeff_preds
        ]

        flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1)
        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
        flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1)
        flatten_pred_bboxes = self.bbox_coder.decode(
            self.flatten_priors_train[..., :2], flatten_pred_bboxes,
            self.stride_tensor[..., 0])
        flatten_pred_coeffs = torch.cat(flatten_pred_coeffs, dim=1)

        assigned_result = self.assigner(
            (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype),
            flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train,
            gt_labels, gt_bboxes, pad_bbox_flag)

        assigned_bboxes = assigned_result['assigned_bboxes']
        assigned_scores = assigned_result['assigned_scores']
        fg_mask_pre_prior = assigned_result['fg_mask_pre_prior']
        assigned_gt_idxs = assigned_result['assigned_gt_idxs']

        assigned_scores_sum = assigned_scores.sum().clamp(min=1)

        loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum()
        loss_cls /= assigned_scores_sum

        # rescale bbox
        assigned_bboxes /= self.stride_tensor
        flatten_pred_bboxes /= self.stride_tensor

        # select positive samples mask
        num_pos = fg_mask_pre_prior.sum()
        if num_pos > 0:
            # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox
            # will not report an error
            # iou loss
            prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4])
            pred_bboxes_pos = torch.masked_select(
                flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(
                assigned_bboxes, prior_bbox_mask).reshape([-1, 4])
            bbox_weight = torch.masked_select(assigned_scores.sum(-1),
                                              fg_mask_pre_prior).unsqueeze(-1)
            loss_bbox = self.loss_bbox(
                pred_bboxes_pos, assigned_bboxes_pos,
                weight=bbox_weight) / assigned_scores_sum

            # dfl loss
            pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior]
            assigned_ltrb = self.bbox_coder.encode(
                self.flatten_priors_train[..., :2] / self.stride_tensor,
                assigned_bboxes,
                max_dis=self.head_module.reg_max - 1,
                eps=0.01)
            assigned_ltrb_pos = torch.masked_select(
                assigned_ltrb, prior_bbox_mask).reshape([-1, 4])
            loss_dfl = self.loss_dfl(pred_dist_pos.reshape(
                -1, self.head_module.reg_max),
                                     assigned_ltrb_pos.reshape(-1),
                                     weight=bbox_weight.expand(-1,
                                                               4).reshape(-1),
                                     avg_factor=assigned_scores_sum)

            _, c, mask_h, mask_w = proto_preds.shape
            if batch_gt_masks.shape[-2:] != (mask_h, mask_w):
                batch_gt_masks = F.interpolate(batch_gt_masks[None],
                                               (mask_h, mask_w),
                                               mode='nearest')[0]

            loss_mask = torch.zeros(1, device=loss_dfl.device)
            box_sum_flag = pad_bbox_flag.long().sum(dim=1).squeeze(1)

            # Offset of each image's first GT inside the batch-wide mask
            # tensor (exclusive cumulative sum of per-image GT counts).
            batch_inds = torch.zeros(num_imgs,
                                     dtype=torch.int64,
                                     device=assigned_gt_idxs.device)[:, None]
            batch_inds[1:] = box_sum_flag.cumsum(dim=0)[:-1][..., None]
            _assigned_gt_idxs = assigned_gt_idxs + batch_inds

            for bs in range(num_imgs):
                # 8400
                bbox_match_inds = assigned_gt_idxs[bs]
                mask_match_inds = _assigned_gt_idxs[bs]

                bbox_match_inds = torch.masked_select(bbox_match_inds,
                                                      fg_mask_pre_prior[bs])
                mask_match_inds = torch.masked_select(mask_match_inds,
                                                      fg_mask_pre_prior[bs])

                # mask
                mask_dim = coeff_preds[0].shape[1]
                prior_mask_mask = fg_mask_pre_prior[bs].unsqueeze(-1).repeat(
                    [1, mask_dim])
                pred_coeffs_pos = torch.masked_select(flatten_pred_coeffs[bs],
                                                      prior_mask_mask).reshape(
                                                          [-1, mask_dim])

                # NOTE(review): 4 and 640 are hard-coded; presumably the
                # prototype down-sampling ratio and the input size — confirm.
                match_boxes = gt_bboxes[bs][bbox_match_inds] / 4
                normed_boxes = gt_bboxes[bs][bbox_match_inds] / 640

                bbox_area = (normed_boxes[:, 2:] -
                             normed_boxes[:, :2]).prod(dim=1)
                # Skip images without positive priors. The former
                # `not mask_match_inds.any()` also skipped images whose
                # matches all point at global GT index 0, because `.any()`
                # tests the index values rather than emptiness.
                if mask_match_inds.numel() == 0:
                    continue
                assert not self.mask_overlap
                mask_gti = batch_gt_masks[mask_match_inds]
                mask_preds = (
                    pred_coeffs_pos @ proto_preds[bs].view(c, -1)).view(
                        -1, mask_h, mask_w)
                loss_mask_full = self.loss_mask(mask_preds, mask_gti)
                _loss_mask = (self.crop_mask(loss_mask_full[None],
                                             match_boxes).mean(dim=(2, 3)) /
                              bbox_area)

                loss_mask += _loss_mask.mean()

        else:
            # Keep the graph connected so every branch still gets a
            # (zero) gradient when there are no positive samples.
            loss_bbox = flatten_pred_bboxes.sum() * 0
            loss_dfl = flatten_pred_bboxes.sum() * 0
            loss_mask = flatten_pred_coeffs.sum() * 0
        _, world_size = get_dist_info()

        return dict(loss_cls=loss_cls * num_imgs * world_size,
                    loss_bbox=loss_bbox * num_imgs * world_size,
                    loss_dfl=loss_dfl * num_imgs * world_size,
                    loss_mask=loss_mask * self.loss_mask_weight * world_size)
from typing import List, Tuple, Union
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.structures import OptSampleList, SampleList
from mmyolo.models.detectors import YOLODetector
from mmyolo.registry import MODELS


@MODELS.register_module()
class YOLOWorldDetector(YOLODetector):
    """YOLO-World detector that pairs image features with text features."""

    def __init__(self,
                 *args,
                 mm_neck: bool = False,
                 num_train_classes=80,
                 num_test_classes=80,
                 **kwargs) -> None:
        # Stored before the parent constructor runs, as in the original.
        self.mm_neck = mm_neck
        self.num_train_classes = num_train_classes
        self.num_test_classes = num_test_classes
        super().__init__(*args, **kwargs)

    def loss(self, batch_inputs: Tensor,
             batch_data_samples: SampleList) -> Union[dict, list]:
        """Compute the training losses for one batch."""
        self.bbox_head.num_classes = self.num_train_classes
        features = self.extract_feat(batch_inputs, batch_data_samples)
        img_feats, txt_feats = features
        return self.bbox_head.loss(img_feats, txt_feats, batch_data_samples)

    def predict(self,
                batch_inputs: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = True) -> SampleList:
        """Run inference with post-processing and attach the predictions to
        the data samples.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)

        # The class count follows the number of text prompts of the first
        # sample rather than `num_test_classes`.
        self.bbox_head.num_classes = txt_feats[0].shape[0]
        results_list = self.bbox_head.predict(img_feats,
                                              txt_feats,
                                              batch_data_samples,
                                              rescale=rescale)

        return self.add_pred_to_datasample(batch_data_samples, results_list)

    def reparameterize(self, texts: List[List[str]]) -> None:
        """Cache the prompts and their backbone text features."""
        self.texts = texts
        self.text_feats = self.backbone.forward_text(texts)

    def _forward(
            self,
            batch_inputs: Tensor,
            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
        """Raw network forward (backbone, neck, head) without any
        post-processing.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        return self.bbox_head.forward(img_feats, txt_feats)

    def extract_feat(
            self, batch_inputs: Tensor,
            batch_data_samples: SampleList) -> Tuple[Tuple[Tensor], Tensor]:
        """Extract image and text features.

        Prompts come from ``batch_data_samples`` when it carries ``texts``;
        otherwise the features cached by `reparameterize` are used.

        Raises:
            TypeError: if no prompts are found and nothing is cached.
        """
        samples_missing = batch_data_samples is None
        is_text_dict = (not samples_missing
                        and isinstance(batch_data_samples, dict)
                        and 'texts' in batch_data_samples)

        cached_feats = None
        if samples_missing:
            texts = self.texts
            cached_feats = self.text_feats
        elif is_text_dict:
            texts = batch_data_samples['texts']
        elif isinstance(batch_data_samples, list) and hasattr(
                batch_data_samples[0], 'texts'):
            texts = [sample.texts for sample in batch_data_samples]
        elif hasattr(self, 'text_feats'):
            texts = self.texts
            cached_feats = self.text_feats
        else:
            raise TypeError('batch_data_samples should be dict or list.')

        if cached_feats is None:
            img_feats, txt_feats = self.backbone(batch_inputs, texts)
        else:
            # forward image only
            txt_feats = cached_feats
            img_feats = self.backbone.forward_image(batch_inputs)

        if self.with_neck:
            neck_inputs = (img_feats, txt_feats) if self.mm_neck else (
                img_feats, )
            img_feats = self.neck(*neck_inputs)
        return img_feats, txt_feats
@MODELS.register_module()
class SimpleYOLOWorldDetector(YOLODetector):
    """YOLO-World detector variant driven by learnable prompt embeddings
    instead of a text backbone.
    """

    def __init__(self,
                 *args,
                 mm_neck: bool = False,
                 num_train_classes=80,
                 num_test_classes=80,
                 prompt_dim=512,
                 num_prompts=80,
                 embedding_path='',
                 reparameterized=False,
                 freeze_prompt=False,
                 use_mlp_adapter=False,
                 **kwargs) -> None:
        self.mm_neck = mm_neck
        self.num_training_classes = num_train_classes
        self.num_test_classes = num_test_classes
        self.prompt_dim = prompt_dim
        self.num_prompts = num_prompts
        self.reparameterized = reparameterized
        self.freeze_prompt = freeze_prompt
        self.use_mlp_adapter = use_mlp_adapter
        super().__init__(*args, **kwargs)

        # A reparameterized model carries no prompt parameters at all.
        if self.reparameterized:
            return

        if len(embedding_path) > 0:
            import numpy as np
            loaded = torch.from_numpy(np.load(embedding_path)).float()
            self.embeddings = torch.nn.Parameter(loaded)
        else:
            # random init
            random_prompts = torch.randn((num_prompts, prompt_dim))
            self.embeddings = nn.Parameter(
                nn.functional.normalize(random_prompts, dim=-1))

        self.embeddings.requires_grad = not self.freeze_prompt

        self.adapter = None
        if use_mlp_adapter:
            self.adapter = nn.Sequential(
                nn.Linear(prompt_dim, prompt_dim * 2), nn.ReLU(True),
                nn.Linear(prompt_dim * 2, prompt_dim))

    def loss(self, batch_inputs: Tensor,
             batch_data_samples: SampleList) -> Union[dict, list]:
        """Compute the training losses for one batch."""
        self.bbox_head.num_classes = self.num_training_classes
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        if not self.reparameterized:
            return self.bbox_head.loss(img_feats, txt_feats,
                                       batch_data_samples)
        return self.bbox_head.loss(img_feats, batch_data_samples)

    def predict(self,
                batch_inputs: Tensor,
                batch_data_samples: SampleList,
                rescale: bool = True) -> SampleList:
        """Run inference with post-processing and attach the predictions to
        the data samples.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)

        self.bbox_head.num_classes = self.num_test_classes
        # The reparameterized head takes no text features.
        head_inputs = [img_feats]
        if not self.reparameterized:
            head_inputs.append(txt_feats)
        results_list = self.bbox_head.predict(*head_inputs,
                                              batch_data_samples,
                                              rescale=rescale)

        return self.add_pred_to_datasample(batch_data_samples, results_list)

    def _forward(
            self,
            batch_inputs: Tensor,
            batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
        """Raw network forward (backbone, neck, head) without any
        post-processing.
        """
        img_feats, txt_feats = self.extract_feat(batch_inputs,
                                                 batch_data_samples)
        if not self.reparameterized:
            return self.bbox_head.forward(img_feats, txt_feats)
        return self.bbox_head.forward(img_feats)

    def extract_feat(
            self, batch_inputs: Tensor,
            batch_data_samples: SampleList) -> Tuple[Tuple[Tensor], Tensor]:
        """Extract image features and build the prompt features.

        Returns:
            ``(img_feats, txt_feats)``; ``txt_feats`` is ``None`` for a
            reparameterized model.
        """
        # only image features
        img_feats, _ = self.backbone(batch_inputs, None)

        txt_feats = None
        if not self.reparameterized:
            # use embeddings
            prompts = self.embeddings[None]
            if self.adapter is not None:
                # Residual adapter followed by L2 re-normalization.
                prompts = self.adapter(prompts) + prompts
                prompts = nn.functional.normalize(prompts, dim=-1, p=2)
            batch_size = img_feats[0].shape[0]
            txt_feats = prompts.repeat(batch_size, 1, 1)

        if self.with_neck:
            if self.mm_neck:
                img_feats = self.neck(img_feats, txt_feats)
            else:
                img_feats = self.neck(img_feats)
        return img_feats, txt_feats
+# Basic brick modules for PAFPN based on CSPLayers + +from .yolo_bricks import ( + CSPLayerWithTwoConv, + MaxSigmoidAttnBlock, + MaxSigmoidCSPLayerWithTwoConv, + ImagePoolingAttentionModule, + RepConvMaxSigmoidCSPLayerWithTwoConv, + RepMaxSigmoidCSPLayerWithTwoConv + ) + +__all__ = ['CSPLayerWithTwoConv', + 'MaxSigmoidAttnBlock', + 'MaxSigmoidCSPLayerWithTwoConv', + 'RepConvMaxSigmoidCSPLayerWithTwoConv', + 'RepMaxSigmoidCSPLayerWithTwoConv', + 'ImagePoolingAttentionModule'] diff --git a/models/YOLO-World/yolo_world/models/layers/yolo_bricks.py b/models/YOLO-World/yolo_world/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..0c39131cfda2de942bfd3fa9f894870b8664f377 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/layers/yolo_bricks.py @@ -0,0 +1,601 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from torch import Tensor +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Linear +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmyolo.registry import MODELS +from mmyolo.models.layers import CSPLayerWithTwoConv + + +@MODELS.register_module() +class MaxSigmoidAttnBlock(BaseModule): + """Max Sigmoid attention block.""" + + def __init__(self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + kernel_size: int = 3, + padding: int = 1, + num_heads: int = 1, + use_depthwise: bool = False, + with_scale: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + init_cfg: OptMultiConfig = None, + use_einsum: bool = True) -> None: + super().__init__(init_cfg=init_cfg) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + assert (out_channels % num_heads == 0 and + embed_channels % num_heads == 0), \ + 'out_channels and 
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block.

    Projects the image feature and the guide (text) feature to a common
    embedding, takes the maximum image/guide similarity per position and
    head, and uses its sigmoid to gate the projected image feature.

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels of the gated output feature map.
        guide_channels (int): Channels of the guide feature.
        embed_channels (int): Channels of the shared embedding.
        kernel_size (int): Kernel size of the output projection conv.
        padding (int): Padding of the output projection conv.
        num_heads (int): Number of attention heads.
        use_depthwise (bool): Use a depthwise-separable projection conv.
        with_scale (bool): Learn a per-head scale for the attention.
        conv_cfg, norm_cfg, init_cfg: Forwarded to the conv modules /
            ``BaseModule``.
        use_einsum (bool): Compute the similarity with ``torch.einsum``
            instead of ``torch.matmul``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        # Per-head width of the embedding. The similarity is computed in the
        # embedding space, so this (not `head_channels`) is the size to
        # reshape and scale with; both agree when
        # embed_channels == out_channels.
        self.embed_head_channels = embed_channels // num_heads
        self.use_einsum = use_einsum

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        self.guide_fc = Linear(guide_channels, embed_channels)
        self.bias = nn.Parameter(torch.zeros(num_heads))
        if with_scale:
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0

        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Gate ``x`` with its max similarity to ``guide``.

        Args:
            x: Image feature of shape (B, in_channels, H, W).
            guide: Guide feature whose last dimension is
                ``guide_channels``; it is reshaped to
                (B, N, num_heads, embed_channels // num_heads).

        Returns:
            Tensor of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        guide = self.guide_fc(guide)
        guide = guide.reshape(B, -1, self.num_heads,
                              self.embed_head_channels)
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.embed_head_channels, H,
                              W)

        if self.use_einsum:
            attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        else:
            batch, m, channel, height, width = embed.shape
            _, n, _, _ = guide.shape
            # (B, M, C, H, W) -> (B, M, H*W, C)
            embed = embed.permute(0, 1, 3, 4, 2)
            embed = embed.reshape(batch, m, -1, channel)
            # (B, N, M, C) -> (B, M, C, N)
            guide = guide.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(embed, guide)
            attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Keep only the best-matching guide entry per position and head.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.embed_head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid() * self.scale

        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepMatrixMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block with the guide stored as a weight matrix.

    Instead of receiving a guide feature at run time, the block keeps a
    learnable ``guide_weight`` of shape
    (guide_channels, embed_channels // num_heads, num_heads).

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels of the gated output feature map.
        embed_channels (int): Channels of the embedding.
        guide_channels (int): Number of guide entries (first dimension of
            ``guide_weight``).
        kernel_size (int): Kernel size of the output projection conv.
        padding (int): Padding of the output projection conv.
        num_heads (int): Number of attention heads.
        use_depthwise (bool): Use a depthwise-separable projection conv.
        with_scale (bool): Accepted for signature parity; not used here.
        conv_cfg, norm_cfg, init_cfg: Forwarded to the conv modules /
            ``BaseModule``.
        use_einsum (bool): Stored only; the similarity always uses
            ``torch.matmul``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        # Per-head width of the embedding; equals `head_channels` whenever
        # embed_channels == out_channels.
        self.embed_head_channels = embed_channels // num_heads
        self.use_einsum = use_einsum

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.guide_weight = nn.Parameter(
            torch.zeros(guide_channels, embed_channels // num_heads,
                        num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Gate ``x`` with its max similarity to the stored guide weights.

        Args:
            x: Image feature of shape (B, in_channels, H, W).
            txt_feats: Unused; kept so the block is call-compatible with
                the text-guided variant.

        Returns:
            Tensor of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.embed_head_channels, H,
                              W)

        batch, m, channel, height, width = embed.shape
        # `guide_weight` is 3-D, (N, C, M) as declared in `__init__`; the
        # former four-way unpack of its shape always raised ValueError.
        # NOTE(review): layout assumed from the declared shape — confirm
        # against the script that fills this parameter.
        n = self.guide_weight.shape[0]
        # can be formulated to split conv
        # (B, M, C, H, W) -> (B, M, H*W, C)
        embed = embed.permute(0, 1, 3, 4, 2)
        embed = embed.reshape(batch, m, -1, channel)
        # (N, C, M) -> (M, C, N) so matmul contracts C per head.
        guide = self.guide_weight.permute(2, 1, 0)
        attn_weight = torch.matmul(embed, guide)
        attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Keep only the best-matching guide entry per position and head.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.embed_head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid()

        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepConvMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block whose guide is a set of 1x1 convs.

    One bias-free ``nn.Conv2d`` per head maps that head's slice of the
    embedding to ``guide_channels`` responses; the maximum response gates
    the projected feature.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        if use_depthwise:
            conv = DepthwiseSeparableConvModule
        else:
            conv = ConvModule

        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum

        # No embedding conv is needed when the widths already agree.
        self.embed_conv = None
        if embed_channels != in_channels:
            self.embed_conv = ConvModule(in_channels,
                                         embed_channels,
                                         1,
                                         conv_cfg=conv_cfg,
                                         norm_cfg=norm_cfg,
                                         act_cfg=None)
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.split_channels = embed_channels // num_heads
        self.guide_convs = nn.ModuleList(
            nn.Conv2d(self.split_channels, guide_channels, 1, bias=False)
            for _ in range(num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Gate ``x`` with the max response of the per-head guide convs.

        ``txt_feats`` is accepted but not used.
        """
        batch, _, height, width = x.shape

        embed = x if self.embed_conv is None else self.embed_conv(x)
        head_slices = list(embed.split(self.split_channels, 1))
        # One (B, N, H, W) response map per head, stacked along channels.
        responses = [
            guide_conv(head_feat)
            for guide_conv, head_feat in zip(self.guide_convs, head_slices)
        ]
        attn_weight = torch.cat(responses, dim=1)
        # B x (M*N) x H x W -> B x M x N x H x W
        attn_weight = attn_weight.view(batch, self.num_heads, -1, height,
                                       width)
        # Max over N, then scale: B x M x H x W
        attn_weight = attn_weight.max(dim=2)[0] / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias.view(1, -1, 1, 1)
        # B x M x 1 x H x W so it broadcasts over each head's channels
        attn_weight = attn_weight.sigmoid()[:, :, None]

        projected = self.project_conv(x)
        projected = projected.view(batch, self.num_heads, -1, height, width)
        gated = projected * attn_weight
        return gated.view(batch, -1, height, width)
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """CSP layer with two convs plus a text-guided max-sigmoid attention
    branch appended to the concatenated features.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,  # shortcut
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None,
            use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

        # Two split halves + one output per block + the attention output.
        fused_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fused_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)

        self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg,
                                              use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Run the CSP blocks, add the guided attention output and fuse."""
        split_sizes = (self.mid_channels, self.mid_channels)
        branches = list(self.main_conv(x).split(split_sizes, 1))
        # Each block consumes the most recently produced branch.
        for block in self.blocks:
            branches.append(block(branches[-1]))
        branches.append(self.attn_block(branches[-1], guide))
        return self.final_conv(torch.cat(branches, 1))
@MODELS.register_module()
class RepMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """CSP layer with two convs whose attention branch is a
    ``RepMatrixMaxSigmoidAttnBlock``.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,  # shortcut
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None,
            use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

        # Fused input: two split halves, one output per block and the
        # attention output, each `mid_channels` wide.
        num_branches = 3 + num_blocks
        self.final_conv = ConvModule(num_branches * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)

        self.attn_block = RepMatrixMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Run the CSP blocks, add the attention output and fuse."""
        stem = self.main_conv(x)
        feats = list(stem.split((self.mid_channels, self.mid_channels), 1))
        # Each block is fed the output of the previous one.
        for layer in self.blocks:
            feats.append(layer(feats[-1]))
        attended = self.attn_block(feats[-1], guide)
        feats.append(attended)
        fused = torch.cat(feats, 1)
        return self.final_conv(fused)
"""Sigmoid-attention based CSP layer with two convolution layers.""" + + def __init__( + self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + num_heads: int = 1, + expand_ratio: float = 0.5, + num_blocks: int = 1, + with_scale: bool = False, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + use_einsum: bool = True) -> None: + super().__init__(in_channels=in_channels, + out_channels=out_channels, + expand_ratio=expand_ratio, + num_blocks=num_blocks, + add_identity=add_identity, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.attn_block = RepConvMaxSigmoidAttnBlock( + self.mid_channels, + self.mid_channels, + embed_channels=embed_channels, + guide_channels=guide_channels, + num_heads=num_heads, + with_scale=with_scale, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + use_einsum=use_einsum) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.append(self.attn_block(x_main[-1], guide)) + return self.final_conv(torch.cat(x_main, 1)) + + +@MODELS.register_module() +class ImagePoolingAttentionModule(nn.Module): + + def __init__(self, + image_channels: List[int], + text_channels: int, + embed_channels: int, + with_scale: bool = False, + num_feats: int = 3, + num_heads: int = 8, + pool_size: int = 3, + use_einsum: bool = True): + super().__init__() + + self.text_channels = text_channels + self.embed_channels = embed_channels + self.num_heads = num_heads + 
self.num_feats = num_feats + self.head_channels = embed_channels // num_heads + self.pool_size = pool_size + self.use_einsum = use_einsum + if with_scale: + self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True) + else: + self.scale = 1.0 + self.projections = nn.ModuleList([ + ConvModule(in_channels, embed_channels, 1, act_cfg=None) + for in_channels in image_channels + ]) + self.query = nn.Sequential(nn.LayerNorm(text_channels), + Linear(text_channels, embed_channels)) + self.key = nn.Sequential(nn.LayerNorm(embed_channels), + Linear(embed_channels, embed_channels)) + self.value = nn.Sequential(nn.LayerNorm(embed_channels), + Linear(embed_channels, embed_channels)) + self.proj = Linear(embed_channels, text_channels) + + self.image_pools = nn.ModuleList([ + nn.AdaptiveMaxPool2d((pool_size, pool_size)) + for _ in range(num_feats) + ]) + + def forward(self, text_features, image_features): + B = image_features[0].shape[0] + assert len(image_features) == self.num_feats + num_patches = self.pool_size**2 + mlvl_image_features = [ + pool(proj(x)).view(B, -1, num_patches) + for (x, proj, pool + ) in zip(image_features, self.projections, self.image_pools) + ] + mlvl_image_features = torch.cat(mlvl_image_features, + dim=-1).transpose(1, 2) + q = self.query(text_features) + k = self.key(mlvl_image_features) + v = self.value(mlvl_image_features) + + q = q.reshape(B, -1, self.num_heads, self.head_channels) + k = k.reshape(B, -1, self.num_heads, self.head_channels) + v = v.reshape(B, -1, self.num_heads, self.head_channels) + if self.use_einsum: + attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k) + else: + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 3, 1) + attn_weight = torch.matmul(q, k) + + attn_weight = attn_weight / (self.head_channels**0.5) + attn_weight = F.softmax(attn_weight, dim=-1) + if self.use_einsum: + x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v) + else: + v = v.permute(0, 2, 1, 3) + x = torch.matmul(attn_weight, v) + x = x.permute(0, 2, 1, 
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
    """Projection-only stand-in for the sigmoid attention blocks.

    The constructor accepts the same guide/embedding arguments as the
    attention blocks, but only a single projection convolution is built and
    the guide tensor is not used in ``forward``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)

        if use_depthwise:
            conv_cls = DepthwiseSeparableConvModule
        else:
            conv_cls = ConvModule

        assert not (out_channels % num_heads or
                    embed_channels % num_heads), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads

        # No activation: the block returns the raw normalised projection.
        self.project_conv = conv_cls(in_channels,
                                     out_channels,
                                     kernel_size,
                                     stride=1,
                                     padding=padding,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Project ``x``; ``guide`` is accepted but ignored.

        Args:
            x (Tensor): Input feature map.
            guide (Tensor): Unused; kept so the call signature matches the
                guided attention blocks.

        Returns:
            Tensor: Output of the projection convolution.
        """
        # The sigmoid gating (x * x.sigmoid()) is intentionally not applied.
        return self.project_conv(x)
+ add_identity=add_identity, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.attn_block = VanillaSigmoidBlock(self.mid_channels, + self.mid_channels, + guide_channels=guide_channels, + embed_channels=embed_channels, + num_heads=num_heads, + with_scale=with_scale, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.append(self.attn_block(x_main[-1], guide)) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/models/YOLO-World/yolo_world/models/losses/__init__.py b/models/YOLO-World/yolo_world/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8423e30498fa69a08b9d66b492261cbfdec9e4f3 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .dynamic_loss import CoVMSELoss + +__all__ = ['CoVMSELoss'] diff --git a/models/YOLO-World/yolo_world/models/losses/dynamic_loss.py b/models/YOLO-World/yolo_world/models/losses/dynamic_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..30d56c3afd8ef22867bf5cba919e4a2844577688 --- /dev/null +++ b/models/YOLO-World/yolo_world/models/losses/dynamic_loss.py @@ -0,0 +1,38 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS


@MODELS.register_module()
class CoVMSELoss(nn.Module):
    """MSE penalty on the coefficient of variation of a prediction tensor.

    The coefficient of variation (standard deviation divided by mean) is
    computed along ``dim`` and pushed towards zero with ``mse_loss``.

    Args:
        dim (int): Dimension along which std and mean are reduced.
        reduction (str): Default reduction passed to ``mse_loss``.
        loss_weight (float): Multiplier applied to the resulting loss.
        eps (float): Lower bound applied to the mean before dividing.
    """

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        self.eps = eps
        self.loss_weight = loss_weight
        self.reduction = reduction
        self.dim = dim

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Compute the weighted loss for ``pred``.

        Args:
            pred (Tensor): Values whose spread along ``self.dim`` is
                penalised.
            weight (Tensor, optional): Forwarded to ``mse_loss``.
            avg_factor (int, optional): Forwarded to ``mse_loss``.
            reduction_override (str, optional): One of ``'none'``,
                ``'mean'`` or ``'sum'``; replaces ``self.reduction`` when
                given.

        Returns:
            Tensor: ``loss_weight`` times the MSE between the coefficient of
            variation and zero.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override or self.reduction

        spread = pred.std(self.dim)
        # Means below ``eps`` are raised to ``eps`` before dividing.
        centre = pred.mean(self.dim).clamp(min=self.eps)
        coeff_of_variation = spread / centre

        zero_target = torch.zeros_like(coeff_of_variation)
        unweighted = mse_loss(coeff_of_variation,
                              zero_target,
                              weight,
                              reduction=reduction,
                              avg_factor=avg_factor)
        return self.loss_weight * unweighted
+import copy +from typing import List, Union + +import torch +import torch.nn as nn +from torch import Tensor +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from mmyolo.models.utils import make_divisible, make_round +from mmyolo.models.necks.yolov8_pafpn import YOLOv8PAFPN + + +@MODELS.register_module() +class YOLOWorldPAFPN(YOLOv8PAFPN): + """Path Aggregation Network used in YOLO World + Following YOLOv8 PAFPN, including text to image fusion + """ + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + guide_channels: int, + embed_channels: List[int], + num_heads: List[int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'), + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + self.guide_channels = guide_channels + self.embed_channels = embed_channels + self.num_heads = num_heads + self.block_cfg = block_cfg + super().__init__(in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
+ """ + block_cfg = copy.deepcopy(self.block_cfg) + block_cfg.update( + dict(in_channels=make_divisible( + (self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + out_channels=make_divisible(self.out_channels[idx - 1], + self.widen_factor), + guide_channels=self.guide_channels, + embed_channels=make_round(self.embed_channels[idx - 1], + self.widen_factor), + num_heads=make_round(self.num_heads[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return MODELS.build(block_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = copy.deepcopy(self.block_cfg) + block_cfg.update( + dict(in_channels=make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + out_channels=make_divisible(self.out_channels[idx + 1], + self.widen_factor), + guide_channels=self.guide_channels, + embed_channels=make_round(self.embed_channels[idx + 1], + self.widen_factor), + num_heads=make_round(self.num_heads[idx + 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return MODELS.build(block_cfg) + + def forward(self, img_feats: List[Tensor], txt_feats: Tensor = None) -> tuple: + """Forward function. 
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
    """Path Aggregation Network used in YOLO World v8.

    Extends ``YOLOWorldPAFPN`` with a text-enhancing module, built from
    ``text_enhancder``. It is applied between the top-down and bottom-up
    paths, so the bottom-up fusion layers receive text features that have
    been updated from the top-down image features.

    Args:
        in_channels (List[int]): Forwarded to ``YOLOWorldPAFPN``.
        out_channels (List[int] | int): Forwarded to ``YOLOWorldPAFPN``; it
            is iterated here, so a list is required in practice.
        guide_channels (int): Forwarded to ``YOLOWorldPAFPN`` and used as the
            ``text_channels`` of the text enhancer.
        embed_channels (List[int]): Forwarded to ``YOLOWorldPAFPN``.
        num_heads (List[int]): Forwarded to ``YOLOWorldPAFPN``.
        deepen_factor (float): Forwarded to ``YOLOWorldPAFPN``.
        widen_factor (float): Forwarded to ``YOLOWorldPAFPN`` and used to
            scale ``out_channels`` for the text enhancer.
        num_csp_blocks (int): Forwarded to ``YOLOWorldPAFPN``.
        freeze_all (bool): Forwarded to ``YOLOWorldPAFPN``.
        text_enhancder (ConfigType): Config of the text enhancer. The
            parameter name is kept as-is so existing configs keep working.
            The dict is copied and never modified in place.
        block_cfg (ConfigType): Forwarded to ``YOLOWorldPAFPN``.
        norm_cfg (ConfigType): Forwarded to ``YOLOWorldPAFPN``.
        act_cfg (ConfigType): Forwarded to ``YOLOWorldPAFPN``.
        init_cfg (OptMultiConfig): Forwarded to ``YOLOWorldPAFPN``.
    """
    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 text_enhancder: ConfigType = dict(
                     type='ImagePoolingAttentionModule',
                     embed_channels=256,
                     num_heads=8,
                     pool_size=3),
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         guide_channels=guide_channels,
                         embed_channels=embed_channels,
                         num_heads=num_heads,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         block_cfg=block_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

        # Work on a private copy: updating the argument in place would write
        # per-instance values into the shared default dict and into the
        # caller's config object.
        text_enhancer_cfg = copy.deepcopy(text_enhancder)
        text_enhancer_cfg.update(
            dict(
                # NOTE(review): plain int() scaling here, while the fusion
                # layers use make_divisible(); confirm both agree for the
                # widen factors in use.
                image_channels=[int(x * widen_factor) for x in out_channels],
                text_channels=guide_channels,
                num_feats=len(out_channels),
            ))
        self.text_enhancer = MODELS.build(text_enhancer_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        Args:
            img_feats (List[Tensor]): One feature map per entry of
                ``self.in_channels``.
            txt_feats (Tensor): Text features passed to every fusion layer
                (presumably BxLxD, as in the parent class; confirm against
                the caller).

        Returns:
            tuple: One output feature map per input level.
        """
        num_levels = len(self.in_channels)
        assert len(img_feats) == num_levels
        # reduce layers
        reduce_outs = [
            self.reduce_layers[idx](img_feats[idx])
            for idx in range(num_levels)
        ]

        # top-down path
        inner_outs = [reduce_outs[-1]]
        for idx in range(num_levels - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            layer_idx = num_levels - 1 - idx
            upsample_feat = self.upsample_layers[layer_idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
            inner_out = self.top_down_layers[layer_idx](
                top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)

        # Update the text features from the top-down image features before
        # they guide the bottom-up path.
        txt_feats = self.text_enhancer(txt_feats, inner_outs)

        # bottom-up path
        outs = [inner_outs[0]]
        for idx in range(num_levels - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat(
                [downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)

        # out_layers
        results = [
            self.out_layers[idx](outs[idx]) for idx in range(num_levels)
        ]

        return tuple(results)
0000000000000000000000000000000000000000..5609d9cdf754db9f9066d0d59f9cb66683236125 --- /dev/null +++ b/pretrained/config_replica.yaml @@ -0,0 +1,21 @@ +openyolo3d: + frequency: 1 + vis_depth_threshold: 0.4 + depth_scale: 6553.5 + topk: 40 + topk_per_image: -1 + +network2d: + text_prompts: ["basket", "bed", "bench", "bin", "blanket", "blinds", "book", "bottle", "box", "bowl", "camera", "cabinet", "candle", "chair", "clock", "cloth", "comforter", "cushion", "desk", "desk-organizer", "door", "indoor-plant", "lamp", "monitor", "nightstand", "panel", "picture", "pillar", "pillow", "pipe", "plant-stand", "plate", "pot", "sculpture", "shelf", "sofa", "stool", "switch", "table", "tablet", "tissue-paper", "tv-screen", "tv-stand", "vase", "vent", "wall-plug", "window", "rug"] + topk: 100 + th: 0.1 + nms: 0.3 + use_amp: False + pretrained_path: "pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" + config_path: "pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" + +network3d: + pretrained_path: "pretrained/checkpoints/scannet200_val.ckpt" + th: 0.02 + nms: 0.1 + is_gt: False diff --git a/pretrained/config_scannet200.yaml b/pretrained/config_scannet200.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da67eed0c095984ec830840ebb3b5498743cb68e --- /dev/null +++ b/pretrained/config_scannet200.yaml @@ -0,0 +1,21 @@ +openyolo3d: + frequency: 10 + vis_depth_threshold: 0.05 + depth_scale: 1000.0 + topk: 40 + topk_per_image: 600 + +network2d: + text_prompts: ['chair', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 
'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'storage container', 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 'closet door', 'vacuum cleaner', 'candle', 'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'guitar case', 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'purse', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock', 'music stand', 'projector screen', 'divider', 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'cd case', 'closet rod', 'coffee kettle', 'structure', 'shower head', 'keyboard piano', 'case of water bottles', 'coat rack', 
'storage organizer', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', 'luggage', 'mattress'] + topk: 100 + th: 0.08 + nms: 0.3 + use_amp: False + pretrained_path: "pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth" + config_path: "pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py" + +network3d: + pretrained_path: "pretrained/checkpoints/scannet200_val.ckpt" + th: 0.04 + nms: 0.6 + is_gt: False \ No newline at end of file diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..2bfc3179deb965e6d62f6ccff4abc12647f298ec --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +import os +_base_ = (f'{os.getcwd()}/models/YOLO-World/third_party/mmyolo/configs/yolov8/' + 'yolov8_x_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + 
model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + use_bn_head=True, + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/texts/obj365v1_class_texts.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), 
+ filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/texts/lvis_v1_class_texts.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + 
dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134026/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134026/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134026/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None 
+close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, 
+ num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, 
+ 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', 
+ 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') 
+test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + 
persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134152/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134152/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134152/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134433/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134433/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134433/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134559/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134559/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134559/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134830/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134830/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_134830/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135416/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135416/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135416/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135651/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135651/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_135651/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140039/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140039/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140039/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140423/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140423/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140423/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140609/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140609/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_140609/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + 
visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, 
type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader 
= dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 
'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 
'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141113/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141113/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141113/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 
+loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 
'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + 
max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + 
pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), 
+ ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff 
--git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141414/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141414/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141414/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor 
= 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 
'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + 
reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 
+obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + 
norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + 
type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141806/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141806/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_141806/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + 
scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + 
data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + 
type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + 
datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + 
pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), 
+ dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 
10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142042/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142042/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142042/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 
+coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 
512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( 
+ border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, 
+ 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( 
+ max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + 
pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142241/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142241/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142241/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142654/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142654/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142654/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142940/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142940/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_142940/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_143210/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_143210/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_143210/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_152552/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_152552/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_152552/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153048/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153048/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153048/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153154/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153154/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_153154/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155047/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155047/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155047/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155226/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155226/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155226/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + 
visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, 
type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader 
= dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 
'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 
'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155818/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155818/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_155818/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 
+loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 
'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + 
max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + 
pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), 
+ ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff 
--git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_160717/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_160717/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_160717/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor 
= 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 
'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + 
reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 
+obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + 
norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + 
type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_163015/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_163015/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_163015/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + 
scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + 
data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + 
type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + 
datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + 
pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), 
+ dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 
10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_181107/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_181107/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_181107/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 
+coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 
512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( 
+ border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, 
+ 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( 
+ max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + 
pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_190536/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_190536/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_190536/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_212538/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_212538/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240530_212538/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_115852/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_115852/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_115852/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120013/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120013/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120013/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120058/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120058/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120058/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120255/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120255/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120255/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120409/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120409/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_120409/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_132952/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_132952/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_132952/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_133132/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_133132/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240531_133132/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + 
visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, 
type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader 
= dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 
'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 
'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190525/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190525/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190525/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 
+loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 
'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + 
max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + 
pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), 
+ ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff 
--git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190700/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190700/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_190700/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor 
= 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 
'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + 
reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 
+obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + 
norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + 
type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_191828/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_191828/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_191828/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + 
scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + 
data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + 
type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + 
datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + 
pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), 
+ dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 
10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192721/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192721/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192721/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 
+coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 
512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( 
+ border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, 
+ 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( 
+ max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + 
pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192803/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192803/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192803/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192936/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192936/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_192936/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193150/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193150/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193150/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193257/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193257/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193257/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193823/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193823/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_193823/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_194509/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_194509/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_194509/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195205/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195205/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195205/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195352/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195352/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195352/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195607/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195607/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195607/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + 
visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, 
type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader 
= dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 
'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 
'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195729/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195729/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195729/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 
+loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 
'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + 
max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + 
pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), 
+ ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff 
--git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195842/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195842/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_195842/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor 
= 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 
'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + 
reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 
+obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + 
norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + 
type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200002/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200002/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200002/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + 
scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + 
data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + 
type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + 
datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + 
pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), 
+ dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 
10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200106/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200106/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200106/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 
+coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 
512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( 
+ border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, 
+ 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( 
+ max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + 
pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200429/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200429/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200429/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200613/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200613/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200613/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200712/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200712/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200712/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200814/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200814/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_200814/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201123/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201123/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201123/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201621/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201621/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201621/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201813/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201813/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_201813/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_203924/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_203924/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_203924/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204301/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204301/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204301/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + 
ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + 
visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, 
type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader 
= dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 
'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + 
ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', 
with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 
'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204356/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204356/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204356/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + 
type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 
+loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 
'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + 
max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, 
type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + 
pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), 
+ ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff 
--git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204855/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204855/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_204855/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor 
= 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 
'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + 
reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 
+obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + 
norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + 
type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_205142/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_205142/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_205142/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + 
scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + 
allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + 
data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + 
type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + 
datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + 
pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), 
+ dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 
10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210103/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210103/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210103/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 
+coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 
1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + 
keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 
512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( 
+ border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, 
+ 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( 
+ max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + 
pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' 
+val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210805/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210805/vis_data/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210805/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', 
+ ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + 
type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + 
), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + 
], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, 
+] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + 
type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, 
type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( 
+ 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210916/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210916/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_210916/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 
960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' 
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, 
type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + 
type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + 
dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 
0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + 
dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), 
+ type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 
+work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211048/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211048/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211048/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = 
dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + 
max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + 
dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + 
loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = 
[ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + 
bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + 
dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + 
data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211138/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211138/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211138/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( 
+ 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) 
+flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, 
type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + 
padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, 
momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + 
type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + 
data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 
'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), 
+ dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, 
+ ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, 
type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + 
type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211422/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211422/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211422/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + 
dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + 
dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + 
bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + 
head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + 
pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + 
ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, 
+ -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), 
+ dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], 
+ type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + 
scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 
'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211642/vis_data/config.py 
b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211642/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211642/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', 
with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + 
logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + 
label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], 
type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + 
ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + 
num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + 
padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, 
+ max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 
'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211827/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211827/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_211827/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), 
type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, 
+ ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + 
dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + 
type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + 
backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + 
nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + 
format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + 
type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + 
data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + 
-320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, 
type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 
960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', 
+ type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_214018/vis_data/config.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_214018/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null +++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/20240601_214018/vis_data/config.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + 
batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 
'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + 
], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + 
dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + 
loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( 
+ 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + 
constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 +tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + 
pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 
'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, 
type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + 
class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +val_interval_stage2 = 1 +val_num_workers = 2 +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='mmdet.DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +weight_decay = 0.025 +widen_factor = 1.25 +work_dir = '/home/jean/Amine/OpenYolo3D/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival' diff --git a/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b00f96ad70de81ecc9073c7d67025d43429906 --- /dev/null 
+++ b/pretrained/configs/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival/yolo_world_v2_x_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,1366 @@ +_backend_args = None +_multiscale_resize_transforms = [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), +] +affine_scale = 0.9 +albu_train_transforms = [ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), +] +backend_args = None +base_lr = 0.002 +batch_shapes_cfg = None +close_mosaic_epochs = 2 +coco_val_dataset = dict( + _delete_=True, + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + 
type='MultiModalDataset') +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0001, + priority=49, + strict_load=False, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=98, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +custom_imports = dict( + allow_failed_imports=False, imports=[ + 'yolo_world', + ]) +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' +deepen_factor = 1.0 +default_hooks = dict( + checkpoint=dict( + interval=2, + max_keep_ckpts=2, + rule='greater', + save_best='auto', + type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict( + lr_factor=0.01, + max_epochs=100, + scheduler_type='linear', + type='YOLOv5ParamSchedulerHook'), + 
sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) +default_scope = 'mmyolo' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +flickr_train_dataset = dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +img_scale = ( + 640, + 640, +) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +last_stage_out_channels = 
512 +last_transform = [ + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), +] +load_from = '/home/jean/Amine/OpenYolo3D/pretrained/checkpoints/yolo_world_v2_x_obj365v1_goldg_cc3mlite_pretrain_1280ft-14996a36.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +loss_bbox_weight = 7.5 +loss_cls_weight = 0.5 +loss_dfl_weight = 0.375 +lr_factor = 0.01 +max_aspect_ratio = 100 +max_epochs = 100 +max_keep_ckpts = 2 +mg_train_dataset = dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + 
dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset') +mixup_prob = 0.15 +model = dict( + backbone=dict( + image_model=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + deepen_factor=1.0, + last_stage_out_channels=512, + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + type='YOLOv8CSPDarknet', + widen_factor=1.25), + text_model=dict( + frozen_modules=[ + 'all', + ], + model_name='openai/clip-vit-base-patch32', + type='HuggingCLIPLanguageBackbone'), + type='MultiModalYOLOBackbone'), + bbox_head=dict( + bbox_coder=dict(type='DistancePointBBoxCoder'), + head_module=dict( + act_cfg=dict(inplace=True, type='SiLU'), + embed_dims=512, + featmap_strides=[ + 8, + 16, + 32, + ], + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_classes=80, + reg_max=16, + type='YOLOWorldHeadModule', + use_bn_head=True, + widen_factor=1.25), + loss_bbox=dict( + bbox_format='xyxy', + iou_mode='ciou', + loss_weight=7.5, + reduction='sum', + return_iou=False, + type='IoULoss'), + loss_cls=dict( + loss_weight=0.5, + reduction='none', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + loss_dfl=dict( + loss_weight=0.375, + reduction='mean', + type='mmdet.DistributionFocalLoss'), + prior_generator=dict( + offset=0.5, strides=[ + 8, + 16, + 32, + ], type='mmdet.MlvlPointGenerator'), + type='YOLOWorldHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 0.0, + 0.0, + 0.0, + ], + std=[ + 255.0, + 255.0, + 255.0, + ], + 
type='YOLOWDetDataPreprocessor'), + mm_neck=True, + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + deepen_factor=1.0, + embed_channels=[ + 128, + 256, + 256, + ], + guide_channels=512, + in_channels=[ + 256, + 512, + 512, + ], + norm_cfg=dict(eps=0.001, momentum=0.03, type='BN'), + num_csp_blocks=3, + num_heads=[ + 4, + 8, + 8, + ], + out_channels=[ + 256, + 512, + 512, + ], + type='YOLOWorldPAFPN', + widen_factor=1.25), + num_test_classes=1203, + num_train_classes=80, + test_cfg=dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + assigner=dict( + alpha=0.5, + beta=6.0, + eps=1e-09, + num_classes=80, + topk=10, + type='BatchTaskAlignedAssigner', + use_ciou=True)), + type='YOLOWorldDetector') +model_test_cfg = dict( + max_per_img=300, + multi_label=True, + nms=dict(iou_threshold=0.7, type='nms'), + nms_pre=30000, + score_thr=0.001) +mosaic_affine_transform = [ + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='Mosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), +] +neck_embed_channels = [ + 128, + 256, + 256, +] +neck_num_heads = [ + 4, + 8, + 8, +] +norm_cfg = dict(eps=0.001, momentum=0.03, type='BN') +num_classes = 1203 +num_det_layers = 3 +num_training_classes = 80 +obj365v1_train_dataset = dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + 
pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset') +optim_wrapper = dict( + clip_grad=dict(max_norm=10.0), + constructor='YOLOWv5OptimizerConstructor', + optimizer=dict( + batch_size_per_gpu=16, lr=0.002, type='AdamW', weight_decay=0.025), + paramwise_cfg=dict( + bias_decay_mult=0.0, + custom_keys=dict({ + 'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +param_scheduler = None +persistent_workers = True +pre_transform = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), +] +resume = False +save_epoch_intervals = 2 +strides = [ + 8, + 16, + 32, +] +tal_alpha = 0.5 +tal_beta = 6.0 
+tal_topk = 10 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='MultiModalDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='mmdet.LVISMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +text_channels = 512 +text_transform = [ + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 
'texts', + ), + type='mmdet.PackDetInputs'), +] +train_ann_file = 'annotations/instances_train2017.json' +train_batch_size_per_gpu = 16 +train_cfg = dict( + dynamic_intervals=[ + ( + 98, + 1, + ), + ], + max_epochs=100, + type='EpochBasedTrainLoop', + val_interval=10) +train_data_prefix = 'train2017/' +train_dataloader = dict( + batch_size=16, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + datasets=[ + dict( + class_text_path='data/texts/obj365v1_class_texts.json', + dataset=dict( + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + data_root='data/objects365v1/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + type='YOLOv5Objects365V1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), 
+ ], + type='MultiModalDataset'), + dict( + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + data_root='data/flickr/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + dict( + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + data_root='data/mixed_grounding/', + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, 
type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), + ], + type='YOLOv5MixedGroundingDataset'), + ], + ignore_keys=[ + 'classes', + 'palette', + ], + type='ConcatDataset'), + num_workers=8, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_num_workers = 8 +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + pad_val=114.0, + pre_transform=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + ], + type='MultiModalMosaic'), + dict( + border=( + -320, + -320, + ), + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 
'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=True, + pad_val=dict(img=114.0), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict( + border_val=( + 114, + 114, + 114, + ), + max_aspect_ratio=100, + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=( + 0.09999999999999998, + 1.9, + ), + type='YOLOv5RandomAffine'), + dict( + bbox_params=dict( + format='pascal_voc', + label_fields=[ + 'gt_bboxes_labels', + 'gt_ignore_flags', + ], + type='BboxParams'), + keymap=dict(gt_bboxes='bboxes', img='image'), + transforms=[ + dict(p=0.01, type='Blur'), + dict(p=0.01, type='MedianBlur'), + dict(p=0.01, type='ToGray'), + dict(p=0.01, type='CLAHE'), + ], + type='mmdet.Albu'), + dict(type='YOLOv5HSVRandomAug'), + dict(prob=0.5, type='mmdet.RandomFlip'), + dict( + max_num_samples=80, + num_neg_samples=( + 1203, + 1203, + ), + padding_to_max=True, + padding_value='', + type='RandomLoadText'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'flip', + 'flip_direction', + 'texts', + ), + type='mmdet.PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=300, nms=dict(iou_threshold=0.65, type='nms')), + 
type='mmdet.DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict( + transforms=[ + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 320, + 320, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 320, + 320, + ), + type='LetterResize'), + ], + type='Compose'), + dict( + transforms=[ + dict(scale=( + 960, + 960, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 960, + 960, + ), + type='LetterResize'), + ], + type='Compose'), + ], + [ + dict(prob=1.0, type='mmdet.RandomFlip'), + dict(prob=0.0, type='mmdet.RandomFlip'), + ], + [ + dict(type='mmdet.LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'pad_param', + 'flip', + 'flip_direction', + ), + type='mmdet.PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_ann_file = 'annotations/instances_val2017.json' +val_batch_size_per_gpu = 1 +val_cfg = dict(type='ValLoop') +val_data_prefix = 'val2017/' +val_dataloader = dict( + batch_size=1, + dataset=dict( + class_text_path='data/texts/lvis_v1_class_texts.json', + dataset=dict( + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + batch_shapes_cfg=None, + data_prefix=dict(img=''), + data_root='data/coco/', + test_mode=True, + type='YOLOv5LVISV1Dataset'), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(scale=( + 640, + 640, + ), type='YOLOv5KeepRatioResize'), + dict( + allow_scale_up=False, + pad_val=dict(img=114), + scale=( + 640, + 640, + ), + type='LetterResize'), + dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True), + dict(type='LoadText'), + dict( + meta_keys=( + 'img_id', + 
def get_iou(masks, masks_batch_size=2):
    """Return the pairwise IoU matrix between all rows of ``masks``.

    Args:
        masks: 2-D tensor with one mask per row; it is cast to float.
            The union counts positions where the sum of two masks is >= 1,
            so the result is a true IoU only for binary (0/1) masks
            -- assumes callers pass binary masks, TODO confirm.
        masks_batch_size: number of rows processed per chunk when building
            the union. It only bounds the size of the temporary
            ``(chunk, num_masks, num_points)`` tensor; the result does not
            depend on it. The default keeps the previously hard-coded value.

    Returns:
        Float tensor of shape ``(num_masks, num_masks)``. Pairs whose union
        is empty get an IoU of 0 instead of NaN.

    Raises:
        ValueError: if ``masks_batch_size`` is smaller than 1.
    """
    if masks_batch_size < 1:
        raise ValueError("masks_batch_size must be >= 1")

    masks = masks.float()
    num_masks = masks.shape[0]
    intersection = torch.einsum('ij,kj -> ik', masks, masks)

    # Build the union chunk by chunk so the broadcasted temporary stays small.
    # max(..., 1) keeps a single (empty) chunk for zero masks so torch.cat
    # still receives one tensor.
    union_chunks = []
    for start in range(0, max(num_masks, 1), masks_batch_size):
        chunk = masks[start:start + masks_batch_size]
        union_chunks.append(((chunk[:, None, :] + masks[None, :, :]) >= 1).sum(-1))
    union = torch.cat(union_chunks)

    # 0/0 (two empty masks) would be NaN; report it as "no overlap".
    return torch.nan_to_num(torch.div(intersection, union), nan=0.0)


def apply_nms(masks, scores, nms_th):
    """Mask-level non-maximum suppression.

    A mask is dropped when any higher-scoring mask overlaps it with an IoU
    strictly greater than ``nms_th``. As in the original loop, a mask that
    was itself dropped still suppresses lower-scoring masks.

    Args:
        masks: tensor of shape ``(num_points, num_masks)``; it is transposed
            internally so that each row is one mask.
        scores: 1-D tensor with one score per mask.
        nms_th: IoU threshold above which the lower-scoring mask is removed.

    Returns:
        1-D int64 CPU tensor with the indices (into the original, unsorted
        ``scores``) of the masks to keep, ordered by decreasing score.
    """
    if scores.numel() == 0:
        return torch.empty(0, dtype=torch.long)

    sorted_indices = torch.argsort(scores, descending=True)
    masks_sorted = masks.permute(1, 0)[sorted_indices]
    iou = get_iou(masks_sorted)

    # Entry (i, j) of the strict upper triangle pairs mask j with a
    # higher-scoring mask i; mask j is suppressed if any such pair overlaps
    # by more than the threshold.
    suppressed = torch.triu(iou > nms_th, diagonal=1).any(dim=0)

    # Map kept positions (in score order) back to indices in the caller's
    # original ordering. The previous dict-based lookup applied the inverse
    # permutation here and returned wrong indices for unsorted scores.
    keep_indices = sorted_indices[~suppressed]
    return keep_indices.cpu()


def generate_vibrant_colors(num_colors):
    """Return ``num_colors`` fully saturated RGB tuples with evenly spaced hues.

    Each colour is a ``(r, g, b)`` tuple of floats in ``[0, 1]`` produced by
    ``colorsys.hsv_to_rgb`` with saturation and value fixed to 1.
    A non-positive ``num_colors`` yields an empty list.
    """
    if num_colors <= 0:
        return []

    hue_increment = 1.0 / num_colors
    return [colorsys.hsv_to_rgb(i * hue_increment, 1.0, 1.0) for i in range(num_colors)]


def get_visibility_mat(pred_masks_3d, inside_mask, topk=15):
    """Select, for every 3D mask, the ``topk`` frames in which it is most visible.

    Visibility of mask ``i`` in frame ``f`` is the fraction of the mask's
    points that are flagged in ``inside_mask[f]``.

    Args:
        pred_masks_3d: tensor of shape ``(num_masks, num_points)``.
        inside_mask: tensor of shape ``(num_frames, num_points)``
            -- presumably marks points that project inside each frame;
            verify against the caller.
        topk: number of frames to select per mask; clipped to ``num_frames``.

    Returns:
        Bool tensor of shape ``(num_masks, num_frames)`` that is True at the
        ``topk`` highest-visibility frames of every mask. Frames are selected
        by rank only, so frames with zero visibility can be selected too.
    """
    intersection = torch.einsum("ik, fk -> if", pred_masks_3d.float(), inside_mask.float())
    total_point_number = pred_masks_3d.float().sum(dim=-1, keepdim=True)
    # Empty masks give 0/0; treat them as not visible rather than NaN.
    visibility_matrix = torch.nan_to_num(intersection / total_point_number, nan=0.0)

    topk = min(topk, visibility_matrix.shape[-1])
    top_frames = torch.topk(visibility_matrix, topk, dim=-1).indices

    visibility_matrix_bool = torch.zeros_like(visibility_matrix).bool()
    rows = torch.arange(visibility_matrix_bool.shape[0], device=visibility_matrix_bool.device)[:, None]
    visibility_matrix_bool[rows, top_frames] = True

    return visibility_matrix_bool


def compute_iou(box, boxes):
    """Return the IoU between one reference box and a batch of boxes.

    Args:
        box: tensor of shape ``(4,)`` in ``(x1, y1, x2, y2)`` order.
        boxes: tensor of shape ``(N, 4)`` in the same order.

    Returns:
        Tensor of shape ``(N,)``. Pairs whose union area is zero (both boxes
        degenerate) get an IoU of 0 instead of NaN.

    Raises:
        AssertionError: if the input shapes do not match the above.
    """
    assert box.shape == (4,), "Reference box must be of shape (4,)"
    assert boxes.ndim == 2 and boxes.shape[1] == 4, "Boxes must be of shape (N, 4)"

    x1_inter = torch.maximum(box[0], boxes[:, 0])
    y1_inter = torch.maximum(box[1], boxes[:, 1])
    x2_inter = torch.minimum(box[2], boxes[:, 2])
    y2_inter = torch.minimum(box[3], boxes[:, 3])
    # clamp(0) zeroes the extent of boxes that do not overlap along an axis.
    inter_area = (x2_inter - x1_inter).clamp(0) * (y2_inter - y1_inter).clamp(0)

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - inter_area

    return torch.nan_to_num(inter_area / union_area, nan=0.0)
{(time.time()-start)}") + print(f"[✅ INFO] Proposals computed.") + + print("[🚀 ACTION] 2D Bounding Boxes computation ...") + start = time.time() + self.preds_2d = self.network_2d.get_bounding_boxes(self.world2cam.color_paths) + # self.preds_2d = torch.load(osp.join(f"/share/data/drive_3/OpenYolo3D/bboxes_2d", f"{scene_name}.pt")) + print(f"[🕒 INFO] Elapsed time {(time.time()-start)}") + print(f"[✅ INFO] Bounding boxes computed.") + + print("[🚀 ACTION] Predicting ...") + start = time.time() + prediction = self.label_3d_masks_from_2d_bboxes(scene_name, is_gt) + print(f"[🕒 INFO] Elapsed time {(time.time()-start)}") + print(f"[✅ INFO] Prediction completed") + + return prediction + + def label_3d_masks_from_2d_bboxes(self, scene_name, is_gt=False): + projections_mesh_to_frame , keep_visible_points = self.mesh_projections + predictions_2d_bboxes = self.preds_2d + prediction_3d_masks, _ = self.preds_3d + + predicted_masks, predicated_classes, predicated_scores = self.label_3d_masks_from_label_maps(prediction_3d_masks.bool(), + predictions_2d_bboxes, + projections_mesh_to_frame, + keep_visible_points, + is_gt) + + self.predicted_masks = predicted_masks + self.predicated_scores = predicated_scores + self.predicated_classes = predicated_classes + + return {scene_name : (predicted_masks, predicated_classes, predicated_scores)} + + + def label_3d_masks_from_label_maps(self, + prediction_3d_masks, + predictions_2d_bboxes, + projections_mesh_to_frame, + keep_visible_points, + is_gt): + + label_maps = self.construct_label_maps(predictions_2d_bboxes) #construct the label maps , start from the biggest bbox to small one + + visibility_matrix = get_visibility_mat(prediction_3d_masks.cuda().permute(1,0), keep_visible_points.cuda(), topk = 25 if is_gt else self.openyolo3d_config["openyolo3d"]["topk"]) + valid_frames = visibility_matrix.sum(dim=0) >= 1 + + prediction_3d_masks = prediction_3d_masks.permute(1,0).cpu() + prediction_3d_masks_np = prediction_3d_masks.numpy() + 
projections_mesh_to_frame = projections_mesh_to_frame[valid_frames].cpu().numpy() + visibility_matrix = visibility_matrix[:, valid_frames].cpu().numpy() + keep_visible_points = keep_visible_points[valid_frames].cpu().numpy() + distributions = [] + + class_labels = [] + class_probs = [] + class_dists = [] + label_maps = label_maps[valid_frames].numpy() + bounding_boxes = predictions_2d_bboxes.values() + bounding_boxes_valid = [bbox for (bi, bbox) in enumerate(bounding_boxes) if valid_frames[bi]] + for mask_id, mask in enumerate(prediction_3d_masks_np): + prob_normalizer = 0 + + representitive_frame_ids = np.where(visibility_matrix[mask_id])[0] + labels_distribution = [] + iou_vals = [] + for representitive_frame_id in representitive_frame_ids: + visible_points_mask = (keep_visible_points[representitive_frame_id].squeeze()*mask).astype(bool) + prob_normalizer += visible_points_mask.sum() + instance_x_y_coords = projections_mesh_to_frame[representitive_frame_id][np.where(visible_points_mask)].astype(np.int64) + + boxes = bounding_boxes_valid[representitive_frame_id]["bbox"].long() + if len(boxes) > 0 and len(instance_x_y_coords > 10): + x_l, x_r, y_t, y_b = instance_x_y_coords[:, 0].min(), instance_x_y_coords[:, 0].max()+1, instance_x_y_coords[:, 1].min(), instance_x_y_coords[:, 1].max()+1 + box = torch.tensor([x_l, y_t, x_r, y_b]) + + iou_values = compute_iou(box, boxes) + iou_vals.append(iou_values.max().item()) + selected_labels = label_maps[representitive_frame_id, instance_x_y_coords[:, 1], instance_x_y_coords[:, 0]] + labels_distribution.append(selected_labels) + + labels_distribution = np.concatenate(labels_distribution) if len(labels_distribution) > 0 else np.array([-1]) + + # class_dists.append(labels_distribution) + distribution = torch.zeros(self.openyolo3d_config["openyolo3d"]["num_classes"]) if self.openyolo3d_config["openyolo3d"]["topk_per_image"] != -1 else None + if (labels_distribution != -1).sum() != 0: + + if distribution is not None: + all_labels = 
torch.from_numpy(labels_distribution[labels_distribution != -1]) + all_labels_unique = all_labels.unique() + for lb in all_labels_unique: + distribution[lb] = (all_labels == lb).sum() + + distribution = distribution/distribution.max() + + class_label = torch.mode(torch.from_numpy(labels_distribution[labels_distribution != -1])).values.item() + class_prob = (labels_distribution == class_label).sum()/prob_normalizer + else: + if distribution is not None: + distribution[-1] = 1.0 + class_label = -1 + class_prob = 0.0 + + iou_vals = torch.tensor(iou_vals) + + class_labels.append(class_label) + if (iou_vals != 0).sum(): + iou_prob = iou_vals[iou_vals != 0].mean().item() + else: + iou_prob = 0.0 + + class_probs.append(class_prob*iou_prob) + if distribution is not None: + distributions.append(distribution) + + pred_classes = torch.tensor(class_labels) + pred_scores = torch.tensor(class_probs) + if distribution is not None: + distributions = torch.stack(distributions) if len(distributions) > 0 else torch.tensor((0, self.openyolo3d_config["openyolo3d"]["num_classes"])) + + if (self.openyolo3d_config["openyolo3d"]["topk_per_image"] != -1) and (not is_gt): + # print("TOPK USED") + n_instance = distributions.shape[0] + distributions = distributions.reshape(-1) + labels = ( + torch.arange(self.openyolo3d_config["openyolo3d"]["num_classes"], device=distributions.device) + .unsqueeze(0) + .repeat(n_instance, 1) + .flatten(0, 1) + ) + + cur_topk = self.openyolo3d_config["openyolo3d"]["topk_per_image"] + _, idx = torch.topk(distributions, k=min(cur_topk, len(distributions)), largest=True) + mask_idx = torch.div(idx, self.openyolo3d_config["openyolo3d"]["num_classes"], rounding_mode="floor") + + pred_classes = labels[idx] + pred_scores = distributions[idx].cuda() + prediction_3d_masks = prediction_3d_masks[mask_idx] + + return prediction_3d_masks.permute(1,0), pred_classes, pred_scores + + def construct_label_maps(self, predictions_2d_bboxes, save_label_map=False): + label_maps = 
(torch.ones((len(predictions_2d_bboxes), self.world2cam.height, self.world2cam.width))*-1).type(torch.int16) + for frame_id, pred in enumerate(predictions_2d_bboxes.values()): + bboxes = pred["bbox"].long() + labels = pred["labels"].type(torch.int16) + + bboxes[:,0] = bboxes[:,0]*self.scaling_params[1] + bboxes[:,2] = bboxes[:,2]*self.scaling_params[1] + bboxes[:,1] = bboxes[:,1]*self.scaling_params[0] + bboxes[:,3] = bboxes[:,3]*self.scaling_params[0] + bboxes_weights = (bboxes[:,2]-bboxes[:,0])+(bboxes[:,3]-bboxes[:,1]) + sorted_indices = bboxes_weights.sort(descending=True).indices + bboxes = bboxes[sorted_indices] + labels = labels[sorted_indices] + for id, bbox in enumerate(bboxes): + label_maps[frame_id, bbox[1]:bbox[3],bbox[0]:bbox[2]] = labels[id] + + return label_maps + + def save_output_as_ply(self, save_path, highest_score = True): + if highest_score : + th = self.predicated_scores.max() + else: + th = self.predicated_scores.max()-0.1 + + mesh = load_mesh(self.world2cam.mesh) + vertex_colors = np.asarray(mesh.vertex_colors) + vibrant_colors = generate_vibrant_colors(len(self.predicated_scores[self.predicated_scores >= th])) + color_id = 0 + for i, class_id in enumerate(self.predicated_classes): + if self.predicated_scores[i] < th: + continue + if len(vibrant_colors) == 0: + break + mask = self.predicted_masks.permute(1,0)[i] + vertex_colors[mask] = np.array(vibrant_colors.pop()) + color_id += 1 + mesh.vertex_colors = o3d.utility.Vector3dVector(vertex_colors) + o3d.io.write_triangle_mesh(save_path, mesh) + + + +class WORLD_2_CAM(): + def __init__(self, path_2_scene, depth_scale, openyolo3d_config = None): + self.poses = {} + self.intrinsics = {} + self.meshes = {} + self.depth_maps_paths = {} + self.depth_color_paths = {} + self.vis_depth_threshold = openyolo3d_config["openyolo3d"]['vis_depth_threshold'] + + frequency = openyolo3d_config["openyolo3d"]['frequency'] + + path_2_poses = osp.join(path_2_scene,"poses") + num_frames = 
len(os.listdir(path_2_poses)) + self.poses = [osp.join(path_2_poses, f"{i}.txt") for i in list(range(num_frames))[::frequency]] + + path_2_intrinsics = osp.join(path_2_scene,"intrinsics.txt") + self.intrinsics = [path_2_intrinsics for i in list(range(num_frames))[::frequency]] + + self.mesh = glob.glob(path_2_scene+"/*.ply")[0] + + path_2_depth = osp.join(path_2_scene,"depth") + self.depth_maps_paths = [osp.join(path_2_depth, f"{i}.png") for i in list(range(num_frames))[::frequency]] + + path_2_color = osp.join(path_2_scene,"color") + self.color_paths = [osp.join(path_2_color, f"{i}.jpg") for i in list(range(num_frames))[::frequency]] + + + self.image_resolution = imageio.imread(list(self.color_paths)[0]).shape[:2] + self.depth_resolution = imageio.imread(list(self.depth_maps_paths)[0]).shape + self.height = self.depth_resolution[0] + self.width = self.depth_resolution[1] + + self.depth_scale = depth_scale + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + @staticmethod + def load_ply(path_2_mesh): + pcd = o3d.io.read_point_cloud(path_2_mesh) + points = np.asarray(pcd.points) + colors = np.asarray(pcd.colors) + # print(points.shape) + coords = np.concatenate([points, np.ones((points.shape[0], 1))], axis = -1) + return coords, colors + + def load_depth_maps(self): + depth_maps = [] + paths_to_depth_maps_scene_i = self.depth_maps_paths + for depth_map_path_i in paths_to_depth_maps_scene_i: + depth_path = os.path.join(depth_map_path_i) + depth_maps.append(torch.from_numpy(imageio.imread(depth_path) / self.depth_scale).to(self.device)) + return torch.stack(depth_maps) + + def adjust_intrinsic(self, intrinsic, original_resolution, new_resolution): + if original_resolution == new_resolution: + return intrinsic + + resize_width = int(math.floor(new_resolution[1] * float( + original_resolution[0]) / float(original_resolution[1]))) + + adapted_intrinsic = intrinsic.copy() + adapted_intrinsic[0, 0] *= float(resize_width) / 
float(original_resolution[0]) + adapted_intrinsic[1, 1] *= float(new_resolution[1]) / float(original_resolution[1]) + adapted_intrinsic[0, 2] *= float(new_resolution[0] - 1) / float(original_resolution[0] - 1) + adapted_intrinsic[1, 2] *= float(new_resolution[1] - 1) / float(original_resolution[1] - 1) + return adapted_intrinsic + + def get_mesh_projections(self): + N_Large = 2000000*250 + + points, colors = self.load_ply(self.mesh) + points, colors = torch.from_numpy(points).cuda(), torch.from_numpy(colors).cuda() + + intrinsic = self.adjust_intrinsic(np.loadtxt(self.intrinsics[0]), self.image_resolution, self.depth_resolution) + intrinsics = torch.from_numpy(np.stack([intrinsic for frame_id in range(len(self.poses))])).cuda() + extrinsics = torch.linalg.inv(torch.from_numpy(np.stack([np.loadtxt(pose) for pose in self.poses])).cuda()) + + if extrinsics.shape[0]*points.shape[0] < N_Large: + word2cam_mat = torch.einsum('bij, jk -> bik',torch.einsum('bij,bjk -> bik', intrinsics,extrinsics), points.T).permute(0,2,1) + else: + B_size = 800000 + Num_Points = points.shape[0] + Num_batches = Num_Points//B_size+1 + word2cam_mat = [] + for b_i in range(Num_batches): + dim_start = b_i*B_size + dim_last = (b_i+1)*B_size if b_i != Num_batches-1 else points.shape[0] + word2cam_mat_i = torch.einsum('bij, jk -> bik',torch.einsum('bij,bjk -> bik', intrinsics,extrinsics), points[dim_start:dim_last].T).permute(0,2,1) + word2cam_mat.append(word2cam_mat_i.cpu()) + word2cam_mat = torch.cat(word2cam_mat, dim = 1) + del intrinsics + del extrinsics + del points + del colors + torch.cuda.empty_cache() + + point_depth = word2cam_mat[:, :, 2].cuda() + if word2cam_mat.shape[1]*word2cam_mat.shape[0] < N_Large: + size = (word2cam_mat.shape[0], word2cam_mat.shape[1]) + mask = (word2cam_mat[:, :, 2] != 0).reshape(size[0]*size[1]) + + projected_points = torch.stack([(word2cam_mat[:, :, 0].reshape(size[0]*size[1])[mask]/word2cam_mat[:, :, 2].reshape(size[0]*size[1])[mask]).reshape(size), + 
(word2cam_mat[:, :, 1].reshape(size[0]*size[1])[mask]/word2cam_mat[:, :, 2].reshape(size[0]*size[1])[mask]).reshape(size)]).permute(1,2,0).long() + inside_mask = ((projected_points[:,:,0] < self.width)*(projected_points[:,:,0] > 0)*(projected_points[:,:,1] < self.height)*(projected_points[:,:,1] >0) == 1 ) + + else: + B_size = 200000 + Num_Points = word2cam_mat.shape[1] + Num_batches = Num_Points//B_size+1 + projected_points = [] + + for b_i in range(Num_batches): + dim_start = b_i*B_size + dim_last = (b_i+1)*B_size if b_i != Num_batches-1 else word2cam_mat.shape[1] + batch_z = word2cam_mat[:, dim_start:dim_last, 2].cuda() + batch_y = word2cam_mat[:, dim_start:dim_last, 1].cuda() + batch_x = word2cam_mat[:, dim_start:dim_last, 0].cuda() + + size = (word2cam_mat.shape[0], dim_last-dim_start) + mask = (batch_z != 0).reshape(size[0]*size[1]) + projected_points_i = torch.stack([(torch.div(batch_x.reshape(size[0]*size[1])[mask],batch_z.reshape(size[0]*size[1])[mask])).reshape(size), + (torch.div(batch_y.reshape(size[0]*size[1])[mask],batch_z.reshape(size[0]*size[1])[mask])).reshape(size)]).permute(1,2,0).long() + projected_points.append(projected_points_i.cpu()) + + + + # merge parts + projected_points = torch.cat(projected_points, dim = 1) + inside_mask = ((projected_points[:,:,0] < self.width)*(projected_points[:,:,0] > 0)*(projected_points[:,:,1] < self.height)*(projected_points[:,:,1] >0) == 1 ) + + + # Get visible points with depth, width, and height + depth_maps = self.load_depth_maps() + num_frames = depth_maps.shape[0] + # pixel_to_3d_point = [] + for frame_id in range(num_frames): + points_in_frame_mask = inside_mask[frame_id].clone() + points_in_frame = (projected_points[frame_id][points_in_frame_mask]) + depth_in_frame = point_depth[frame_id][points_in_frame_mask] + visibility_mask = (torch.abs(depth_maps[frame_id][points_in_frame[:,1].long(), points_in_frame[:,0].long()] + - depth_in_frame) <= \ + self.vis_depth_threshold) + + 
inside_mask[frame_id][points_in_frame_mask] = visibility_mask.to(inside_mask.device) + + return projected_points.type(torch.int16).cpu(), inside_mask.cpu() \ No newline at end of file diff --git a/utils/utils_2d.py b/utils/utils_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..37d9ebc62d4d0aa217f97f5b5ab84642327c6794 --- /dev/null +++ b/utils/utils_2d.py @@ -0,0 +1,110 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import os +import cv2 +import os.path as osp +from torchvision.ops import nms +import torch +from mmengine.runner.amp import autocast +from tqdm import tqdm +import yaml +from PIL import Image +from mmengine.dataset import Compose +from mmyolo.registry import RUNNERS +from mmengine.config import Config, DictAction +from mmengine.runner import Runner +import supervision as sv + +def load_yaml(path): + with open(path) as stream: + try: + config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + return config + +def get_image_resolution(image_path): + """ + Get the resolution of an image. 
+ + :param image_path: Path to the image file + :return: A tuple containing the width and height of the image + """ + with Image.open(image_path) as img: + width, height = img.size + return width, height + +class Network_2D(): + def __init__(self, config): + self.texts = [[t] for t in config["network2d"]["text_prompts"]] + [[' ']] + self.topk = config["network2d"]["topk"] + self.th = config["network2d"]["th"] + self.nms = config["network2d"]["nms"] + self.use_amp = config["network2d"]["use_amp"] + self.resolution = None + self.frequency = config["openyolo3d"]["frequency"] + cfg = Config.fromfile(os.path.join(os.getcwd(), config["network2d"]["config_path"])) + cfg.work_dir = osp.join(f'{os.getcwd()}/models/YOLO-World/yolo_world/work_dirs', + osp.splitext(config["network2d"]["config_path"])[0].split("/")[-1]) + cfg.load_from = os.path.join(os.getcwd(), config["network2d"]["pretrained_path"]) + if 'runner_type' not in cfg: + self.runner = Runner.from_cfg(cfg) + else: + self.runner = RUNNERS.build(cfg) + + self.runner.call_hook('before_run') + self.runner.load_or_resume() + pipeline = cfg.test_dataloader.dataset.pipeline + self.runner.pipeline = Compose(pipeline) + self.runner.model.eval() + + def get_bounding_boxes(self, path_2_images): + print(f"Infering from {len(path_2_images)} images") + + scene_preds = {} + for image_path in tqdm(path_2_images): + frame_prediction = self.inference_detector([image_path]) + scene_preds.update(frame_prediction) + return scene_preds + + def inference_detector(self, images_batch): + if self.resolution is None: + self.resolution = get_image_resolution(images_batch[0]) + inputs = [] + data_samples = [] + for img_id, image_path in enumerate(images_batch): + data_info = dict(img_id=img_id, img_path=image_path, texts=self.texts) + data_info = self.runner.pipeline(data_info) + inputs.append(data_info['inputs']) + data_samples.append(data_info['data_samples']) + + + data_batch = dict(inputs=torch.stack(inputs), + data_samples=data_samples) + 
+ with autocast(enabled=self.use_amp), torch.no_grad(): + output = self.runner.model.test_step(data_batch) + frame_prediction = {} + + for img_id, image_path in enumerate(images_batch): + with autocast(enabled=self.use_amp), torch.no_grad(): + pred_instances = output[img_id].pred_instances + keep = nms(pred_instances.bboxes, pred_instances.scores, iou_threshold=self.nms) + pred_instances = pred_instances[keep] + pred_instances = pred_instances[pred_instances.scores.float() > self.th] + + if len(pred_instances.scores) > self.topk: + indices = pred_instances.scores.float().topk(self.topk)[1] + pred_instances = pred_instances[indices] + mask = ~(((pred_instances['bboxes'][:,2]-pred_instances['bboxes'][:,0] > self.resolution[0]-50)*(pred_instances['bboxes'][:,3]-pred_instances['bboxes'][:,1] > self.resolution[1]-50)) == 1) + bboxes_ = pred_instances['bboxes'][mask].cpu() + labels_ = pred_instances['labels'][mask].cpu() + scores_ = pred_instances['scores'][mask].cpu() + frame_id = osp.basename(image_path).split(".")[0] + + frame_prediction.update({frame_id:{"bbox":bboxes_, "labels":labels_, "scores":scores_}}) + + return frame_prediction + + + + diff --git a/utils/utils_3d.py b/utils/utils_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..950e88c27f9f211fabd16c2ff5fb69226bae604e --- /dev/null +++ b/utils/utils_3d.py @@ -0,0 +1,19 @@ + +import sys +sys.path.append("..") +from models.Mask3D.mask3d import get_model, load_mesh, prepare_data, map_output_to_pointcloud, save_colorized_mesh +import torch + +class Network_3D(): + def __init__(self, config): + self.model = get_model(config["network3d"]["pretrained_path"]) + self.model.eval() + self.device = torch.device("cuda:0") + self.model.to(self.device) + + def get_class_agnostic_masks(self, pointcloud_file, point2segment=None): + data, points, colors, features, unique_map, inverse_map, point2segment, point2segment_full = prepare_data(pointcloud_file, self.device) + with torch.no_grad(): + outputs = 
self.model(data, raw_coordinates=features, point2segment=[point2segment] if point2segment is not None else None) + return map_output_to_pointcloud(outputs, inverse_map, point2segment, point2segment_full) + \ No newline at end of file