mshukor committed
Commit 33f1db4
1 Parent(s): 7e2d7dc
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. LICENSE +21 -0
  2. README.md +7 -8
  3. TimeSformer/.gitignore +143 -0
  4. TimeSformer/CODE_OF_CONDUCT.md +5 -0
  5. TimeSformer/CONTRIBUTING.md +25 -0
  6. TimeSformer/LICENSE +399 -0
  7. TimeSformer/README.md +248 -0
  8. TimeSformer/configs/Kinetics/SLOWFAST_4x16_R50.yaml +63 -0
  9. TimeSformer/configs/Kinetics/SLOWFAST_8x8_R101.yaml +63 -0
  10. TimeSformer/configs/Kinetics/SLOWFAST_8x8_R50.yaml +63 -0
  11. TimeSformer/configs/Kinetics/TimeSformer_divST_16x16_448.yaml +45 -0
  12. TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224.yaml +45 -0
  13. TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml +45 -0
  14. TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml +46 -0
  15. TimeSformer/configs/Kinetics/TimeSformer_divST_96x4_224.yaml +45 -0
  16. TimeSformer/configs/Kinetics/TimeSformer_jointST_8x32_224.yaml +45 -0
  17. TimeSformer/configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml +45 -0
  18. TimeSformer/configs/SSv2/SLOWFAST_16x8_R50.yaml +83 -0
  19. TimeSformer/configs/SSv2/TimeSformer_divST_16_448.yaml +48 -0
  20. TimeSformer/configs/SSv2/TimeSformer_divST_64_224.yaml +48 -0
  21. TimeSformer/configs/SSv2/TimeSformer_divST_8_224.yaml +48 -0
  22. TimeSformer/environment.yml +26 -0
  23. TimeSformer/example.ipynb +84 -0
  24. TimeSformer/setup.cfg +23 -0
  25. TimeSformer/setup.py +23 -0
  26. TimeSformer/slurm_scripts/run_multi_node_job.sh +25 -0
  27. TimeSformer/slurm_scripts/run_single_node_job.sh +35 -0
  28. TimeSformer/timesformer/__init__.py +5 -0
  29. TimeSformer/timesformer/config/__init__.py +1 -0
  30. TimeSformer/timesformer/config/defaults.py +820 -0
  31. TimeSformer/timesformer/datasets/DATASET.md +26 -0
  32. TimeSformer/timesformer/datasets/__init__.py +5 -0
  33. TimeSformer/timesformer/datasets/build.py +30 -0
  34. TimeSformer/timesformer/datasets/cv2_transform.py +796 -0
  35. TimeSformer/timesformer/datasets/decoder.py +392 -0
  36. TimeSformer/timesformer/datasets/kinetics.py +294 -0
  37. TimeSformer/timesformer/datasets/loader.py +134 -0
  38. TimeSformer/timesformer/datasets/multigrid_helper.py +78 -0
  39. TimeSformer/timesformer/datasets/ssv2.py +278 -0
  40. TimeSformer/timesformer/datasets/transform.py +459 -0
  41. TimeSformer/timesformer/datasets/utils.py +380 -0
  42. TimeSformer/timesformer/datasets/video_container.py +31 -0
  43. TimeSformer/timesformer/models/__init__.py +5 -0
  44. TimeSformer/timesformer/models/batchnorm_helper.py +217 -0
  45. TimeSformer/timesformer/models/build.py +54 -0
  46. TimeSformer/timesformer/models/conv2d_same.py +74 -0
  47. TimeSformer/timesformer/models/custom_video_model_builder.py +4 -0
  48. TimeSformer/timesformer/models/features.py +266 -0
  49. TimeSformer/timesformer/models/head_helper.py +235 -0
  50. TimeSformer/timesformer/models/helpers.py +360 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 mshukor
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,11 @@
  ---
- title: EP ALM Audio Text
- emoji: 🚀
+ title: eP-ALM
+ emoji: 🌍
  colorFrom: purple
- colorTo: green
+ colorTo: pink
  sdk: gradio
- sdk_version: 3.36.1
+ sdk_version: 3.12.0
  app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ pinned: true
+ license: apache-2.0
+ ---
TimeSformer/.gitignore ADDED
@@ -0,0 +1,143 @@
1
+
2
+
3
+
4
+
5
+ # Docker file from Python is inspired from here :
6
+ # https://github.com/github/gitignore/blob/master/Python.gitignore
7
+
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ tests/report/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102
+ __pypackages__/
103
+
104
+ # Celery stuff
105
+ celerybeat-schedule
106
+ celerybeat.pid
107
+
108
+ # SageMath parsed files
109
+ *.sage.py
110
+
111
+ # Environments
112
+ .env
113
+ .venv
114
+ env/
115
+ venv/
116
+ ENV/
117
+ env.bak/
118
+ venv.bak/
119
+
120
+ # Spyder project settings
121
+ .spyderproject
122
+ .spyproject
123
+
124
+ # Rope project settings
125
+ .ropeproject
126
+
127
+ # mkdocs documentation
128
+ /site
129
+
130
+ # mypy
131
+ .mypy_cache/
132
+ .dmypy.json
133
+ dmypy.json
134
+
135
+ # Pyre type checker
136
+ .pyre/
137
+
138
+ # pytype static type analyzer
139
+ .pytype/
140
+
141
+
142
+ # Cython debug symbols
143
+ cython_debug/
TimeSformer/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,5 @@
+ # Code of Conduct
+
+ Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+ Please read the [full text](https://code.fb.com/codeofconduct/)
+ so that you can understand what actions will and will not be tolerated.
TimeSformer/CONTRIBUTING.md ADDED
@@ -0,0 +1,25 @@
+ # Contributing to TimeSformer
+
+ ## Pull Requests
+ We actively welcome your pull requests.
+
+ 1. Fork the repo and create your branch from `master`.
+ 2. If you've added code that should be tested, add tests.
+ 3. If you've changed APIs, update the documentation.
+ 4. Ensure the test suite passes.
+ 5. Make sure your code lints.
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+ ## Contributor License Agreement ("CLA")
+ In order to accept your pull request, we need you to submit a CLA. You only need
+ to do this once to work on any of Facebook's open source projects.
+
+ Complete your CLA here: <https://code.facebook.com/cla>
+
+ ## Issues
+ We use GitHub issues to track public bugs. Please ensure your description is
+ clear and has sufficient instructions to be able to reproduce the issue.
+
+ ## License
+ By contributing to TimeSformer, you agree that your contributions will be licensed
+ under the [LICENSE.md](LICENSE.md) file in the root directory of this source tree.
TimeSformer/LICENSE ADDED
@@ -0,0 +1,399 @@
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
TimeSformer/README.md ADDED
@@ -0,0 +1,248 @@
+ # TimeSformer
+
+ This is an official PyTorch implementation of our ICML 2021 paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/pdf/2102.05095.pdf). In this repository, we provide PyTorch code for training and testing our proposed TimeSformer model. TimeSformer provides an efficient video classification framework that achieves state-of-the-art results on several video action recognition benchmarks such as Kinetics-400.
+
+ If you find TimeSformer useful in your research, please use the following BibTeX entry for citation.
+
+ ```BibTeX
+ @inproceedings{gberta_2021_ICML,
+   author = {Gedas Bertasius and Heng Wang and Lorenzo Torresani},
+   title = {Is Space-Time Attention All You Need for Video Understanding?},
+   booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
+   month = {July},
+   year = {2021}
+ }
+ ```
+
+ # Model Zoo
+
+ We provide TimeSformer models pretrained on Kinetics-400 (K400), Kinetics-600 (K600), Something-Something-V2 (SSv2), and HowTo100M datasets.
+
+ | name | dataset | # of frames | spatial crop | acc@1 | acc@5 | url |
+ | --- | --- | --- | --- | --- | --- | --- |
+ | TimeSformer | K400 | 8 | 224 | 77.9 | 93.2 | [model](https://www.dropbox.com/s/g5t24we9gl5yk88/TimeSformer_divST_8x32_224_K400.pyth?dl=0) |
+ | TimeSformer-HR | K400 | 16 | 448 | 79.6 | 94.0 | [model](https://www.dropbox.com/s/6f0x172lpqy3oxt/TimeSformer_divST_16x16_448_K400.pyth?dl=0) |
+ | TimeSformer-L | K400 | 96 | 224 | 80.6 | 94.7 | [model](https://www.dropbox.com/s/r1iuxahif3sgimo/TimeSformer_divST_96x4_224_K400.pyth?dl=0) |
+
+ | name | dataset | # of frames | spatial crop | acc@1 | acc@5 | url |
+ | --- | --- | --- | --- | --- | --- | --- |
+ | TimeSformer | K600 | 8 | 224 | 79.1 | 94.4 | [model](https://www.dropbox.com/s/4h2qt41m2z3aqrb/TimeSformer_divST_8x32_224_K600.pyth?dl=0) |
+ | TimeSformer-HR | K600 | 16 | 448 | 81.8 | 95.8 | [model](https://www.dropbox.com/s/ft1e92g2vhvxecv/TimeSformer_divST_16x16_448_K600.pyth?dl=0) |
+ | TimeSformer-L | K600 | 96 | 224 | 82.2 | 95.6 | [model](https://www.dropbox.com/s/857rx6xeclxfhdg/TimeSformer_divST_96x4_224_K600.pyth?dl=0) |
+
+ | name | dataset | # of frames | spatial crop | acc@1 | acc@5 | url |
+ | --- | --- | --- | --- | --- | --- | --- |
+ | TimeSformer | SSv2 | 8 | 224 | 59.1 | 85.6 | [model](https://www.dropbox.com/s/tybhuml57y24wpm/TimeSformer_divST_8_224_SSv2.pyth?dl=0) |
+ | TimeSformer-HR | SSv2 | 16 | 448 | 61.8 | 86.9 | [model](https://www.dropbox.com/s/9t68uzk8w2fpfnv/TimeSformer_divST_16_448_SSv2.pyth?dl=0) |
+ | TimeSformer-L | SSv2 | 64 | 224 | 62.0 | 87.5 | [model](https://www.dropbox.com/s/3f1rm2al8mhprwa/TimeSformer_divST_64_224_SSv2.pyth?dl=0) |
+
+ | name | dataset | # of frames | spatial crop | single clip coverage | acc@1 | url |
+ | --- | --- | --- | --- | --- | --- | --- |
+ | TimeSformer | HowTo100M | 8 | 224 | 8.5s | 56.8 | [model](https://www.dropbox.com/s/9v8hcm88b9tc6ff/TimeSformer_divST_8x32_224_HowTo100M.pyth?dl=0) |
+ | TimeSformer | HowTo100M | 32 | 224 | 34.1s | 61.2 | [model](https://www.dropbox.com/s/4roflx4q1gscu85/TimeSformer_divST_32x32_224_HowTo100M.pyth?dl=0) |
+ | TimeSformer | HowTo100M | 64 | 448 | 68.3s | 62.2 | [model](https://www.dropbox.com/s/15bvqltl1j5vyp3/TimeSformer_divST_64x32_224_HowTo100M.pyth?dl=0) |
+ | TimeSformer | HowTo100M | 96 | 224 | 102.4s | 62.6 | [model](https://www.dropbox.com/s/t2mzgahnfhgakma/TimeSformer_divST_96x32_224_HowTo100M.pyth?dl=0) |
+
+ We note that these models were re-trained using a slightly different implementation than the one used in the paper. Therefore, there might be a small difference in performance compared to the results reported in the paper.
+
+ You can load the pretrained models as follows:
+
+ ```python
+ import torch
+ from timesformer.models.vit import TimeSformer
+
+ model = TimeSformer(img_size=224, num_classes=400, num_frames=8, attention_type='divided_space_time', pretrained_model='/path/to/pretrained/model.pyth')
+
+ dummy_video = torch.randn(2, 3, 8, 224, 224)  # (batch x channels x frames x height x width)
+
+ pred = model(dummy_video)  # (2, 400)
+ ```
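+
+ The forward pass returns raw class logits. The short sketch below (plain PyTorch, added here purely as an illustration rather than part of the original example) turns `pred` from the snippet above into top-5 class probabilities:
+
+ ```python
+ import torch.nn.functional as F
+
+ probs = F.softmax(pred, dim=-1)              # (2, 400) per-clip class probabilities
+ top5_prob, top5_idx = probs.topk(5, dim=-1)  # five most likely Kinetics-400 classes per clip
+ print(top5_idx[0].tolist(), top5_prob[0].tolist())
+ ```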
+
+ # Installation
+
+ First, create a conda virtual environment and activate it:
+ ```
+ conda create -n timesformer python=3.7 -y
+ source activate timesformer
+ ```
+
+ Then, install the following packages:
+
+ - torchvision: `pip install torchvision` or `conda install torchvision -c pytorch`
+ - [fvcore](https://github.com/facebookresearch/fvcore/): `pip install 'git+https://github.com/facebookresearch/fvcore'`
+ - simplejson: `pip install simplejson`
+ - einops: `pip install einops`
+ - timm: `pip install timm`
+ - PyAV: `conda install av -c conda-forge`
+ - psutil: `pip install psutil`
+ - scikit-learn: `pip install scikit-learn`
+ - OpenCV: `pip install opencv-python`
+ - tensorboard: `pip install tensorboard`
+
+ Lastly, build the TimeSformer codebase by running:
+ ```
+ git clone https://github.com/facebookresearch/TimeSformer
+ cd TimeSformer
+ python setup.py build develop
+ ```
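+
+ To quickly confirm that the build is importable (a suggested check, not part of the original instructions; the import path matches the loading example above):
+ ```
+ python -c "from timesformer.models.vit import TimeSformer; print('TimeSformer import OK')"
+ ```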
+
+ # Usage
+
+ ## Dataset Preparation
+
+ Please use the dataset preparation instructions provided in [DATASET.md](timesformer/datasets/DATASET.md).
+
+ ## Training the Default TimeSformer
+
+ Training the default TimeSformer, which uses divided space-time attention and operates on 8-frame clips cropped at 224x224 spatial resolution, can be done using the following command:
+
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   NUM_GPUS 8 \
+   TRAIN.BATCH_SIZE 8
+ ```
+ You may need to pass the location of your dataset on the command line by adding `DATA.PATH_TO_DATA_DIR path_to_your_dataset`, or you can simply add
+
+ ```
+ DATA:
+   PATH_TO_DATA_DIR: path_to_your_dataset
+ ```
+
+ to the YAML config file, so that you do not need to pass it on the command line every time.
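+
+ For reference, here is a rough standalone sketch of what these trailing `KEY VALUE` overrides do. It uses the `yacs` package for illustration only; it is not the project's actual code path, which goes through its own config defaults in `timesformer/config/defaults.py`:
+
+ ```python
+ from yacs.config import CfgNode as CN
+
+ # Load the YAML config, then apply command-line style overrides on top of it.
+ cfg = CN(new_allowed=True)
+ cfg.merge_from_file("configs/Kinetics/TimeSformer_divST_8x32_224.yaml")
+ cfg.merge_from_list(["DATA.PATH_TO_DATA_DIR", "path_to_your_dataset",
+                      "NUM_GPUS", 8,
+                      "TRAIN.BATCH_SIZE", 8])
+
+ print(cfg.DATA.PATH_TO_DATA_DIR, cfg.NUM_GPUS, cfg.TRAIN.BATCH_SIZE)
+ ```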
+
+ ## Using a Different Number of GPUs
+
+ If you want to use a smaller number of GPUs, you need to modify the .yaml configuration files in [`configs/`](configs/). Specifically, you need to modify the NUM_GPUS, TRAIN.BATCH_SIZE, TEST.BATCH_SIZE, and DATA_LOADER.NUM_WORKERS entries in each configuration file. The BATCH_SIZE entry should be the same as or higher than the NUM_GPUS entry. In [`configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml`](configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml), we provide a sample configuration file for a 4-GPU setup.
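+
+ The relevant entries in that sample configuration (abridged) are:
+
+ ```
+ NUM_GPUS: 4
+ TRAIN:
+   BATCH_SIZE: 4
+ TEST:
+   BATCH_SIZE: 4
+ DATA_LOADER:
+   NUM_WORKERS: 4
+ ```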
+
+
+ ## Using Different Self-Attention Schemes
+
+ If you want to experiment with different space-time self-attention schemes, e.g., space-only or joint space-time attention, use the following commands:
+
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   NUM_GPUS 8 \
+   TRAIN.BATCH_SIZE 8
+ ```
+
+ and
+
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_jointST_8x32_224.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   NUM_GPUS 8 \
+   TRAIN.BATCH_SIZE 8
+ ```
+
+ ## Training Different TimeSformer Variants
+
+ If you want to train more powerful TimeSformer variants, e.g., TimeSformer-HR (operating on 16-frame clips sampled at 448x448 spatial resolution) and TimeSformer-L (operating on 96-frame clips sampled at 224x224 spatial resolution), use the following commands:
+
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_divST_16x16_448.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   NUM_GPUS 8 \
+   TRAIN.BATCH_SIZE 8
+ ```
+
+ and
+
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_divST_96x4_224.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   NUM_GPUS 8 \
+   TRAIN.BATCH_SIZE 8
+ ```
+
+ Note that for these models you will need a set of GPUs with ~32GB of memory.
+
+ ## Inference
+
+ Use `TRAIN.ENABLE` and `TEST.ENABLE` to control whether training or testing is performed for a given run. When testing, you also have to provide the path to the model checkpoint via `TEST.CHECKPOINT_FILE_PATH`.
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   TEST.CHECKPOINT_FILE_PATH path_to_your_checkpoint \
+   TRAIN.ENABLE False
+ ```
+
+ ## Single-Node Training via Slurm
+
+ To train TimeSformer via Slurm, please check out our single-node Slurm training script [`slurm_scripts/run_single_node_job.sh`](slurm_scripts/run_single_node_job.sh).
+
+
+ ## Multi-Node Training via Submitit
+
+ Distributed training is available via Slurm and submitit:
+
+ ```
+ pip install submitit
+ ```
+
+ To train the TimeSformer model on Kinetics using 4 nodes with 8 GPUs each, use the following command:
+ ```
+ python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --name ${JOB_NAME} --use_volta32
+ ```
+
+ We provide a script for launching Slurm jobs in [`slurm_scripts/run_multi_node_job.sh`](slurm_scripts/run_multi_node_job.sh).
+
+ ## Finetuning
+
+ To finetune from an existing PyTorch checkpoint, add the following lines on the command line, or add them to the YAML config:
+
+ ```
+ TRAIN.CHECKPOINT_FILE_PATH path_to_your_PyTorch_checkpoint
+ TRAIN.FINETUNE True
+ ```
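+
+ For example, combined with the training command shown earlier (a sketch; the config file and both paths are placeholders to replace with your own):
+ ```
+ python tools/run_net.py \
+   --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml \
+   DATA.PATH_TO_DATA_DIR path_to_your_dataset \
+   TRAIN.CHECKPOINT_FILE_PATH path_to_your_PyTorch_checkpoint \
+   TRAIN.FINETUNE True
+ ```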
+
+ ## HowTo100M Dataset Split
+
+ If you want to experiment with the long-term video modeling task on HowTo100M, please download the train/test split files from [here](https://www.dropbox.com/sh/ttvsxwqypijjuda/AACmJx1CnddW6cVBoc21eSuva?dl=0).
+
+
+ # Environment
+
+ The code was developed using Python 3.7 on Ubuntu 20.04. For training, we used four GPU compute nodes, each containing 8 Tesla V100 GPUs (32 GPUs in total). Other platforms or GPU cards have not been fully tested.
+
+ # License
+
+ The majority of this work is licensed under the [CC-NC 4.0 International license](LICENSE). However, portions of the project are available under separate license terms: [SlowFast](https://github.com/facebookresearch/SlowFast) and [pytorch-image-models](https://github.com/rwightman/pytorch-image-models) are licensed under the Apache 2.0 license.
+
+ # Contributing
+
+ We actively welcome your pull requests. Please see [CONTRIBUTING.md](CONTRIBUTING.md) and [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) for more info.
+
+ # Acknowledgements
+
+ TimeSformer is built on top of [PySlowFast](https://github.com/facebookresearch/SlowFast) and [pytorch-image-models](https://github.com/rwightman/pytorch-image-models) by [Ross Wightman](https://github.com/rwightman). We thank the authors for releasing their code. If you use our model, please consider citing these works as well:
+
+ ```BibTeX
+ @misc{fan2020pyslowfast,
+   author = {Haoqi Fan and Yanghao Li and Bo Xiong and Wan-Yen Lo and
+             Christoph Feichtenhofer},
+   title = {PySlowFast},
+   howpublished = {\url{https://github.com/facebookresearch/slowfast}},
+   year = {2020}
+ }
+ ```
+
+ ```BibTeX
+ @misc{rw2019timm,
+   author = {Ross Wightman},
+   title = {PyTorch Image Models},
+   year = {2019},
+   publisher = {GitHub},
+   journal = {GitHub repository},
+   doi = {10.5281/zenodo.4414861},
+   howpublished = {\url{https://github.com/rwightman/pytorch-image-models}}
+ }
+ ```
TimeSformer/configs/Kinetics/SLOWFAST_4x16_R50.yaml ADDED
@@ -0,0 +1,63 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 64
5
+ EVAL_PERIOD: 10
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 32
11
+ SAMPLING_RATE: 2
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 256
15
+ INPUT_CHANNEL_NUM: [3, 3]
16
+ SLOWFAST:
17
+ ALPHA: 8
18
+ BETA_INV: 8
19
+ FUSION_CONV_CHANNEL_RATIO: 2
20
+ FUSION_KERNEL_SZ: 5
21
+ RESNET:
22
+ ZERO_INIT_FINAL_BN: True
23
+ WIDTH_PER_GROUP: 64
24
+ NUM_GROUPS: 1
25
+ DEPTH: 50
26
+ TRANS_FUNC: bottleneck_transform
27
+ STRIDE_1X1: False
28
+ NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
29
+ SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
30
+ SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
31
+ NONLOCAL:
32
+ LOCATION: [[[], []], [[], []], [[], []], [[], []]]
33
+ GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]
34
+ INSTANTIATION: dot_product
35
+ BN:
36
+ USE_PRECISE_STATS: True
37
+ NUM_BATCHES_PRECISE: 200
38
+ SOLVER:
39
+ BASE_LR: 0.8
40
+ LR_POLICY: cosine
41
+ MAX_EPOCH: 196
42
+ MOMENTUM: 0.9
43
+ WEIGHT_DECAY: 1e-4
44
+ WARMUP_EPOCHS: 34.0
45
+ WARMUP_START_LR: 0.01
46
+ OPTIMIZING_METHOD: sgd
47
+ MODEL:
48
+ NUM_CLASSES: 400
49
+ ARCH: slowfast
50
+ MODEL_NAME: SlowFast
51
+ LOSS_FUNC: cross_entropy
52
+ DROPOUT_RATE: 0.5
53
+ TEST:
54
+ ENABLE: True
55
+ DATASET: kinetics
56
+ BATCH_SIZE: 64
57
+ DATA_LOADER:
58
+ NUM_WORKERS: 8
59
+ PIN_MEMORY: True
60
+ NUM_GPUS: 8
61
+ NUM_SHARDS: 1
62
+ RNG_SEED: 0
63
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/SLOWFAST_8x8_R101.yaml ADDED
@@ -0,0 +1,63 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 64
5
+ EVAL_PERIOD: 10
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 32
11
+ SAMPLING_RATE: 2
12
+ TRAIN_JITTER_SCALES: [256, 340]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 256
15
+ INPUT_CHANNEL_NUM: [3, 3]
16
+ SLOWFAST:
17
+ ALPHA: 4
18
+ BETA_INV: 8
19
+ FUSION_CONV_CHANNEL_RATIO: 2
20
+ FUSION_KERNEL_SZ: 5
21
+ RESNET:
22
+ ZERO_INIT_FINAL_BN: True
23
+ WIDTH_PER_GROUP: 64
24
+ NUM_GROUPS: 1
25
+ DEPTH: 101
26
+ TRANS_FUNC: bottleneck_transform
27
+ STRIDE_1X1: False
28
+ NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
29
+ SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
30
+ SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
31
+ NONLOCAL:
32
+ LOCATION: [[[], []], [[], []], [[], []], [[], []]]
33
+ GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]
34
+ INSTANTIATION: dot_product
35
+ BN:
36
+ USE_PRECISE_STATS: True
37
+ NUM_BATCHES_PRECISE: 200
38
+ SOLVER:
39
+ BASE_LR: 0.8 ## 8 nodes
40
+ LR_POLICY: cosine
41
+ MAX_EPOCH: 196
42
+ MOMENTUM: 0.9
43
+ WEIGHT_DECAY: 1e-4
44
+ WARMUP_EPOCHS: 34.0
45
+ WARMUP_START_LR: 0.01
46
+ OPTIMIZING_METHOD: sgd
47
+ MODEL:
48
+ NUM_CLASSES: 400
49
+ ARCH: slowfast
50
+ MODEL_NAME: SlowFast
51
+ LOSS_FUNC: cross_entropy
52
+ DROPOUT_RATE: 0.5
53
+ TEST:
54
+ ENABLE: True
55
+ DATASET: kinetics
56
+ BATCH_SIZE: 64
57
+ DATA_LOADER:
58
+ NUM_WORKERS: 8
59
+ PIN_MEMORY: True
60
+ NUM_GPUS: 8
61
+ NUM_SHARDS: 1
62
+ RNG_SEED: 0
63
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/SLOWFAST_8x8_R50.yaml ADDED
@@ -0,0 +1,63 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 64
5
+ EVAL_PERIOD: 10
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 32
11
+ SAMPLING_RATE: 2
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 256
15
+ INPUT_CHANNEL_NUM: [3, 3]
16
+ SLOWFAST:
17
+ ALPHA: 4
18
+ BETA_INV: 8
19
+ FUSION_CONV_CHANNEL_RATIO: 2
20
+ FUSION_KERNEL_SZ: 7
21
+ RESNET:
22
+ ZERO_INIT_FINAL_BN: True
23
+ WIDTH_PER_GROUP: 64
24
+ NUM_GROUPS: 1
25
+ DEPTH: 50
26
+ TRANS_FUNC: bottleneck_transform
27
+ STRIDE_1X1: False
28
+ NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
29
+ SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
30
+ SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
31
+ NONLOCAL:
32
+ LOCATION: [[[], []], [[], []], [[], []], [[], []]]
33
+ GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]
34
+ INSTANTIATION: dot_product
35
+ BN:
36
+ USE_PRECISE_STATS: True
37
+ NUM_BATCHES_PRECISE: 200
38
+ SOLVER:
39
+ BASE_LR: 0.8
40
+ LR_POLICY: cosine
41
+ MAX_EPOCH: 196
42
+ MOMENTUM: 0.9
43
+ WEIGHT_DECAY: 1e-4
44
+ WARMUP_EPOCHS: 34.0
45
+ WARMUP_START_LR: 0.01
46
+ OPTIMIZING_METHOD: sgd
47
+ MODEL:
48
+ NUM_CLASSES: 400
49
+ ARCH: slowfast
50
+ MODEL_NAME: SlowFast
51
+ LOSS_FUNC: cross_entropy
52
+ DROPOUT_RATE: 0.5
53
+ TEST:
54
+ ENABLE: True
55
+ DATASET: kinetics
56
+ BATCH_SIZE: 64
57
+ DATA_LOADER:
58
+ NUM_WORKERS: 8
59
+ PIN_MEMORY: True
60
+ NUM_GPUS: 8
61
+ NUM_SHARDS: 1
62
+ RNG_SEED: 0
63
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_divST_16x16_448.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 16
11
+ SAMPLING_RATE: 16
12
+ TRAIN_JITTER_SCALES: [448, 512]
13
+ TRAIN_CROP_SIZE: 448
14
+ TEST_CROP_SIZE: 448
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'divided_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 8
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 8
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 8
11
+ SAMPLING_RATE: 32
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'divided_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 8
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 8
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224_4gpus.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 4
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 8
11
+ SAMPLING_RATE: 32
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'divided_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 4
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 4
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 4
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml ADDED
@@ -0,0 +1,46 @@
1
+ TRAIN:
2
+ ENABLE: False
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 8
11
+ SAMPLING_RATE: 32
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'divided_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ CHECKPOINT_FILE_PATH: '/checkpoint/gedas/jobs/timesformer/kinetics_400/TimeSformer_divST_8x32_224/checkpoints/checkpoint_epoch_00025.pyth'
40
+ DATA_LOADER:
41
+ NUM_WORKERS: 8
42
+ PIN_MEMORY: True
43
+ NUM_GPUS: 8
44
+ NUM_SHARDS: 1
45
+ RNG_SEED: 0
46
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_divST_96x4_224.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 96
11
+ SAMPLING_RATE: 4
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'divided_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 8
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 8
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_jointST_8x32_224.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 8
11
+ SAMPLING_RATE: 32
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'joint_space_time'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 8
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 8
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml ADDED
@@ -0,0 +1,45 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: kinetics
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: /path/to/kinetics/
10
+ NUM_FRAMES: 8
11
+ SAMPLING_RATE: 32
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ TIMESFORMER:
17
+ ATTENTION_TYPE: 'space_only'
18
+ SOLVER:
19
+ BASE_LR: 0.005
20
+ LR_POLICY: steps_with_relative_lrs
21
+ STEPS: [0, 11, 14]
22
+ LRS: [1, 0.1, 0.01]
23
+ MAX_EPOCH: 15
24
+ MOMENTUM: 0.9
25
+ WEIGHT_DECAY: 1e-4
26
+ OPTIMIZING_METHOD: sgd
27
+ MODEL:
28
+ MODEL_NAME: vit_base_patch16_224
29
+ NUM_CLASSES: 400
30
+ ARCH: vit
31
+ LOSS_FUNC: cross_entropy
32
+ DROPOUT_RATE: 0.5
33
+ TEST:
34
+ ENABLE: True
35
+ DATASET: kinetics
36
+ BATCH_SIZE: 8
37
+ NUM_ENSEMBLE_VIEWS: 1
38
+ NUM_SPATIAL_CROPS: 3
39
+ DATA_LOADER:
40
+ NUM_WORKERS: 8
41
+ PIN_MEMORY: True
42
+ NUM_GPUS: 8
43
+ NUM_SHARDS: 1
44
+ RNG_SEED: 0
45
+ OUTPUT_DIR: .
TimeSformer/configs/SSv2/SLOWFAST_16x8_R50.yaml ADDED
@@ -0,0 +1,83 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: ssv2
4
+ BATCH_SIZE: 16
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: " /path/to/ssv2/annotations/"
10
+ PATH_PREFIX: "/path/to/ssv2/frames/"
11
+ NUM_FRAMES: 64
12
+ SAMPLING_RATE: 2
13
+ TRAIN_JITTER_SCALES: [256, 320]
14
+ TRAIN_CROP_SIZE: 224
15
+ TEST_CROP_SIZE: 256
16
+ INPUT_CHANNEL_NUM: [3, 3]
17
+ INV_UNIFORM_SAMPLE: True
18
+ RANDOM_FLIP: False
19
+ REVERSE_INPUT_CHANNEL: True
20
+ SLOWFAST:
21
+ ALPHA: 4
22
+ BETA_INV: 8
23
+ FUSION_CONV_CHANNEL_RATIO: 2
24
+ FUSION_KERNEL_SZ: 7
25
+ RESNET:
26
+ SPATIAL_STRIDES: [[1, 1], [2, 2], [2, 2], [2, 2]]
27
+ SPATIAL_DILATIONS: [[1, 1], [1, 1], [1, 1], [1, 1]]
28
+ ZERO_INIT_FINAL_BN: True
29
+ WIDTH_PER_GROUP: 64
30
+ NUM_GROUPS: 1
31
+ DEPTH: 50
32
+ TRANS_FUNC: bottleneck_transform
33
+ STRIDE_1X1: False
34
+ NUM_BLOCK_TEMP_KERNEL: [[3, 3], [4, 4], [6, 6], [3, 3]]
35
+ NONLOCAL:
36
+ LOCATION: [[[], []], [[], []], [[], []], [[], []]]
37
+ GROUP: [[1, 1], [1, 1], [1, 1], [1, 1]]
38
+ INSTANTIATION: dot_product
39
+ BN:
40
+ USE_PRECISE_STATS: True
41
+ NUM_BATCHES_PRECISE: 200
42
+ NORM_TYPE: sync_batchnorm
43
+ NUM_SYNC_DEVICES: 4
44
+ SOLVER:
45
+ BASE_LR: 0.2 #8 nodes
46
+ LR_POLICY: cosine
47
+ MAX_EPOCH: 200
48
+ MOMENTUM: 0.9
49
+ WEIGHT_DECAY: 1e-4
50
+ WARMUP_EPOCHS: 34.0
51
+ WARMUP_START_LR: 0.01
52
+ OPTIMIZING_METHOD: sgd
53
+ #SOLVER:
54
+ # BASE_LR: 0.03
55
+ # LR_POLICY: steps_with_relative_lrs
56
+ # LRS: [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
57
+ # STEPS: [0, 14, 18]
58
+ # MAX_EPOCH: 22
59
+ # MOMENTUM: 0.9
60
+ # WEIGHT_DECAY: 1e-6
61
+ # WARMUP_EPOCHS: 0.19
62
+ # WARMUP_START_LR: 0.0001
63
+ # OPTIMIZING_METHOD: sgd
64
+ MODEL:
65
+ NUM_CLASSES: 174
66
+ ARCH: slowfast
67
+ LOSS_FUNC: cross_entropy
68
+ DROPOUT_RATE: 0.5
69
+ TEST:
70
+ ENABLE: True
71
+ DATASET: ssv2
72
+ BATCH_SIZE: 16
73
+ NUM_ENSEMBLE_VIEWS: 1
74
+ NUM_SPATIAL_CROPS: 1
75
+ DATA_LOADER:
76
+ NUM_WORKERS: 4
77
+ PIN_MEMORY: True
78
+ NUM_GPUS: 8
79
+ NUM_SHARDS: 1
80
+ RNG_SEED: 0
81
+ OUTPUT_DIR: .
82
+ #LOG_MODEL_INFO: False
83
+ LOG_MODEL_INFO: True
TimeSformer/configs/SSv2/TimeSformer_divST_16_448.yaml ADDED
@@ -0,0 +1,48 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: ssv2
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: "/path/to/ssv2/annotations/"
10
+ PATH_PREFIX: "/path/to/ssv2/frames/"
11
+ NUM_FRAMES: 16
12
+ TRAIN_JITTER_SCALES: [448, 512]
13
+ TRAIN_CROP_SIZE: 448
14
+ TEST_CROP_SIZE: 448
15
+ INPUT_CHANNEL_NUM: [3]
16
+ INV_UNIFORM_SAMPLE: True
17
+ RANDOM_FLIP: False
18
+ REVERSE_INPUT_CHANNEL: True
19
+ TIMESFORMER:
20
+ ATTENTION_TYPE: 'divided_space_time'
21
+ SOLVER:
22
+ BASE_LR: 0.005
23
+ LR_POLICY: steps_with_relative_lrs
24
+ STEPS: [0, 11, 14]
25
+ LRS: [1, 0.1, 0.01]
26
+ MAX_EPOCH: 15
27
+ MOMENTUM: 0.9
28
+ WEIGHT_DECAY: 1e-4
29
+ OPTIMIZING_METHOD: sgd
30
+ MODEL:
31
+ MODEL_NAME: vit_base_patch16_224
32
+ NUM_CLASSES: 174
33
+ ARCH: vit
34
+ LOSS_FUNC: cross_entropy
35
+ DROPOUT_RATE: 0.5
36
+ TEST:
37
+ ENABLE: True
38
+ DATASET: ssv2
39
+ BATCH_SIZE: 8
40
+ NUM_ENSEMBLE_VIEWS: 1
41
+ NUM_SPATIAL_CROPS: 3
42
+ DATA_LOADER:
43
+ NUM_WORKERS: 4
44
+ PIN_MEMORY: True
45
+ NUM_GPUS: 8
46
+ NUM_SHARDS: 1
47
+ RNG_SEED: 0
48
+ OUTPUT_DIR: .
TimeSformer/configs/SSv2/TimeSformer_divST_64_224.yaml ADDED
@@ -0,0 +1,48 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: ssv2
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: "/path/to/ssv2/annotations/"
10
+ PATH_PREFIX: "/path/to/ssv2/frames/"
11
+ NUM_FRAMES: 64
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ INV_UNIFORM_SAMPLE: True
17
+ RANDOM_FLIP: False
18
+ REVERSE_INPUT_CHANNEL: True
19
+ TIMESFORMER:
20
+ ATTENTION_TYPE: 'divided_space_time'
21
+ SOLVER:
22
+ BASE_LR: 0.005
23
+ LR_POLICY: steps_with_relative_lrs
24
+ STEPS: [0, 11, 14]
25
+ LRS: [1, 0.1, 0.01]
26
+ MAX_EPOCH: 15
27
+ MOMENTUM: 0.9
28
+ WEIGHT_DECAY: 1e-4
29
+ OPTIMIZING_METHOD: sgd
30
+ MODEL:
31
+ MODEL_NAME: vit_base_patch16_224
32
+ NUM_CLASSES: 174
33
+ ARCH: vit
34
+ LOSS_FUNC: cross_entropy
35
+ DROPOUT_RATE: 0.5
36
+ TEST:
37
+ ENABLE: True
38
+ DATASET: ssv2
39
+ BATCH_SIZE: 8
40
+ NUM_ENSEMBLE_VIEWS: 1
41
+ NUM_SPATIAL_CROPS: 3
42
+ DATA_LOADER:
43
+ NUM_WORKERS: 4
44
+ PIN_MEMORY: True
45
+ NUM_GPUS: 8
46
+ NUM_SHARDS: 1
47
+ RNG_SEED: 0
48
+ OUTPUT_DIR: .
TimeSformer/configs/SSv2/TimeSformer_divST_8_224.yaml ADDED
@@ -0,0 +1,48 @@
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: ssv2
4
+ BATCH_SIZE: 8
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ DATA:
9
+ PATH_TO_DATA_DIR: "/path/to/ssv2/annotations/"
10
+ PATH_PREFIX: "/path/to/ssv2/frames/"
11
+ NUM_FRAMES: 8
12
+ TRAIN_JITTER_SCALES: [256, 320]
13
+ TRAIN_CROP_SIZE: 224
14
+ TEST_CROP_SIZE: 224
15
+ INPUT_CHANNEL_NUM: [3]
16
+ INV_UNIFORM_SAMPLE: True
17
+ RANDOM_FLIP: False
18
+ REVERSE_INPUT_CHANNEL: True
19
+ TIMESFORMER:
20
+ ATTENTION_TYPE: 'divided_space_time'
21
+ SOLVER:
22
+ BASE_LR: 0.005
23
+ LR_POLICY: steps_with_relative_lrs
24
+ STEPS: [0, 11, 14]
25
+ LRS: [1, 0.1, 0.01]
26
+ MAX_EPOCH: 15
27
+ MOMENTUM: 0.9
28
+ WEIGHT_DECAY: 1e-4
29
+ OPTIMIZING_METHOD: sgd
30
+ MODEL:
31
+ MODEL_NAME: vit_base_patch16_224
32
+ NUM_CLASSES: 174
33
+ ARCH: vit
34
+ LOSS_FUNC: cross_entropy
35
+ DROPOUT_RATE: 0.5
36
+ TEST:
37
+ ENABLE: True
38
+ DATASET: ssv2
39
+ BATCH_SIZE: 8
40
+ NUM_ENSEMBLE_VIEWS: 1
41
+ NUM_SPATIAL_CROPS: 3
42
+ DATA_LOADER:
43
+ NUM_WORKERS: 4
44
+ PIN_MEMORY: True
45
+ NUM_GPUS: 8
46
+ NUM_SHARDS: 1
47
+ RNG_SEED: 0
48
+ OUTPUT_DIR: .
TimeSformer/environment.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: timesformer
2
+ channels:
3
+ - pytorch
4
+ - conda-forge
5
+ - defaults
6
+ dependencies:
7
+ - python>3.7
8
+ - jupyterlab
9
+ - pandas>=1.2
10
+ - numpy>1.19
11
+ - pytorch>=1.6
12
+ - torchvision>=0.7
13
+ - scikit-learn>=0.22
14
+ - opencv>=4.2
15
+ - pyyaml>=5.1
16
+ - yacs>=0.1.6
17
+ - einops>=0.3
18
+ - tensorboard
19
+ - psutil
20
+ - tqdm
21
+ - matplotlib
22
+ - simplejson
23
+ - pip
24
+ - pip:
25
+ - fvcore
26
+ - av
TimeSformer/example.ipynb ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "08fe0c59",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from pathlib import Path\n",
11
+ "\n",
12
+ "import torch\n",
13
+ "from timesformer.models.vit import TimeSformer"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "id": "10239d32",
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "data": {
24
+ "text/plain": [
25
+ "True"
26
+ ]
27
+ },
28
+ "execution_count": 2,
29
+ "metadata": {},
30
+ "output_type": "execute_result"
31
+ }
32
+ ],
33
+ "source": [
34
+ "model_file = Path.home()/'TimeSformer/models/TimeSformer_divST_8x32_224_K600.pyth'\n",
35
+ "model_file.exists()"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "id": "652fb03e",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "model = TimeSformer(img_size=224, num_classes=600, num_frames=8, attention_type='divided_space_time', pretrained_model=str(model_file))\n",
46
+ "\n",
47
+ "dummy_video = torch.randn(2, 3, 8, 224, 224) # (batch x channels x frames x height x width)\n",
48
+ "\n",
49
+ "pred = model(dummy_video,) # (2, 600)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 6,
55
+ "id": "83de13c5-791c-4db7-aba4-6d29ce88584e",
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "assert pred.shape == (2,600)"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": "Python 3",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.9.4"
80
+ }
81
+ },
82
+ "nbformat": 4,
83
+ "nbformat_minor": 5
84
+ }
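The notebook above feeds the model a random tensor laid out as `(batch, channels, frames, height, width)`. Below is a hedged sketch of producing that layout from a real clip; the file name is a placeholder, the uniform frame sampling is only illustrative, and the normalization constants are the `DATA.MEAN`/`DATA.STD` defaults from the config:

```python
# Illustrative preprocessing sketch (not the repo's data pipeline):
# decode a clip, sample 8 frames, resize to 224x224, normalize, and batch it.
import torch
import torch.nn.functional as F
import torchvision

frames, _, _ = torchvision.io.read_video("my_clip.mp4", pts_unit="sec")  # (T, H, W, C) uint8
idx = torch.linspace(0, frames.shape[0] - 1, steps=8).long()             # 8 uniformly spaced frames
clip = frames[idx].permute(3, 0, 1, 2).float() / 255.0                   # (C, T, H, W) in [0, 1]
clip = F.interpolate(clip, size=(224, 224))                              # resize the spatial dims
mean = torch.tensor([0.45, 0.45, 0.45]).view(3, 1, 1, 1)                 # DATA.MEAN default
std = torch.tensor([0.225, 0.225, 0.225]).view(3, 1, 1, 1)               # DATA.STD default
clip = (clip - mean) / std
batch = clip.unsqueeze(0)                                                # (1, 3, 8, 224, 224)
```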
TimeSformer/setup.cfg ADDED
@@ -0,0 +1,23 @@
1
+ [isort]
2
+ line_length=100
3
+ multi_line_output=4
4
+ known_standard_library=numpy,setuptools
5
+ known_myself=timesformer
6
+ known_third_party=fvcore,av,torch,pycocotools,yacs,termcolor,scipy,simplejson,matplotlib,torchvision,yaml,tqdm,psutil,opencv-python,pandas,tensorboard,moviepy,sklearn,cv2
7
+ no_lines_before=STDLIB,THIRDPARTY
8
+ sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER
9
+ default_section=FIRSTPARTY
10
+
11
+ [mypy]
12
+ python_version=3.6
13
+ ignore_missing_imports = True
14
+ warn_unused_configs = True
15
+ disallow_untyped_defs = True
16
+ check_untyped_defs = True
17
+ warn_unused_ignores = True
18
+ warn_redundant_casts = True
19
+ show_column_numbers = True
20
+ follow_imports = silent
21
+ allow_redefinition = True
22
+ ; Require all functions to be annotated
23
+ disallow_incomplete_defs = True
TimeSformer/setup.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ from setuptools import find_packages, setup
4
+
5
+ setup(
6
+ name="timesformer",
7
+ version="1.0",
8
+ author="FBAI",
9
+ url="unknown",
10
+ description="TimeSformer",
11
+ keywords = [
12
+ 'artificial intelligence',
13
+ 'attention mechanism',
14
+ 'transformers',
15
+ 'video classification',
16
+ ],
17
+ install_requires=[
18
+ 'einops>=0.3',
19
+ 'torch>=1.6'
20
+ ],
21
+ extras_require={"tensorboard_video_visualization": ["moviepy"]},
22
+ packages=find_packages(exclude=("configs", "tests")),
23
+ )
TimeSformer/slurm_scripts/run_multi_node_job.sh ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ # A script with a list of commands for submitting SLURM jobs
3
+
4
+ #### Kinetics training
5
+ JOB_NAME=TimeSformer_divST_8x32_224
6
+ python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32
7
+
8
+ #JOB_NAME=TimeSformer_jointST_8x32_224
9
+ #python tools/submit.py --cfg configs/Kinetics/TimeSformer_jointST_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32
10
+
11
+ #JOB_NAME=TimeSformer_spaceOnly_8x32_224
12
+ #python tools/submit.py --cfg configs/Kinetics/TimeSformer_spaceOnly_8x32_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32
13
+
14
+ #### Kinetics inference
15
+ #JOB_NAME=TimeSformer_divST_8x32_224_TEST_3clips
16
+ #python tools/submit.py --cfg configs/Kinetics/TimeSformer_divST_8x32_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition dev --comment "" --name ${JOB_NAME} --use_volta32
17
+
18
+
19
+ ##### SSv2 training
20
+ #JOB_NAME=TimeSformer_divST_8_224
21
+ #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32
22
+
23
+ ##### Sth-Sth_v2 inference
24
+ #JOB_NAME=TimeSformer_divST_8_224_TEST_3clips
25
+ #python tools/submit.py --cfg configs/SSv2/TimeSformer_divST_8_224_TEST.yaml --job_dir /your/job/dir/${JOB_NAME}/ --num_shards 4 --partition learnfair --comment "" --name ${JOB_NAME} --use_volta32
TimeSformer/slurm_scripts/run_single_node_job.sh ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ # A script with a list of commands for submitting SLURM jobs
3
+
4
+ #SBATCH --job-name=timesformer
5
+ #SBATCH --mail-type=END,FAIL,REQUEUE
6
+ #SBATCH --mail-user=name@domain.com
7
+
8
+ ## %j is the job id, %u is the user id
9
+ #SBATCH --output=/path/to/output/logs/slog-%A-%a.out
10
+
11
+ ## filename for job standard error output (stderr)
12
+ #SBATCH --error=/path/to/error/logs/slog-%A-%a.err
13
+
14
+ #SBATCH --array=1
15
+ #SBATCH --partition=partition_of_your_choice
16
+ #SBATCH --nodes=1 -C volta32gb
17
+ #SBATCH --ntasks-per-node=1
18
+ #SBATCH --gpus-per-node=8
19
+ #SBATCH --cpus-per-task=80
20
+ #SBATCH --mem=480GB
21
+ #SBATCH --signal=USR1@600
22
+ #SBATCH --time=72:00:00
23
+ #SBATCH --open-mode=append
24
+
25
+ module purge
26
+ module load cuda/10.0
27
+ module load NCCL/2.4.7-1-cuda.10.0
28
+ module load cudnn/v7.4-cuda.10.0
29
+ source activate timesformer
30
+
31
+ WORKINGDIR=/path/to/TimeSformer
32
+ CURPYTHON=/path/to/python
33
+
34
+ srun --label ${CURPYTHON} ${WORKINGDIR}/tools/run_net.py --cfg ${WORKINGDIR}/configs/Kinetics/TimeSformer_divST_8x32_224.yaml NUM_GPUS 8 TRAIN.BATCH_SIZE 8
35
+
TimeSformer/timesformer/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ from timesformer.utils.env import setup_environment
4
+
5
+ setup_environment()
TimeSformer/timesformer/config/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
TimeSformer/timesformer/config/defaults.py ADDED
@@ -0,0 +1,820 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """Configs."""
4
+ from fvcore.common.config import CfgNode
5
+ # -----------------------------------------------------------------------------
6
+ # Config definition
7
+ # -----------------------------------------------------------------------------
8
+ _C = CfgNode()
9
+
10
+ # ---------------------------------------------------------------------------- #
11
+ # Batch norm options
12
+ # ---------------------------------------------------------------------------- #
13
+ _C.BN = CfgNode()
14
+
15
+ # Precise BN stats.
16
+ _C.BN.USE_PRECISE_STATS = False
17
+
18
+ # Number of samples used to compute precise BN stats.
19
+ _C.BN.NUM_BATCHES_PRECISE = 200
20
+
21
+ # Weight decay value that applies on BN.
22
+ _C.BN.WEIGHT_DECAY = 0.0
23
+
24
+ # Norm type, options include `batchnorm`, `sub_batchnorm`, `sync_batchnorm`
25
+ _C.BN.NORM_TYPE = "batchnorm"
26
+
27
+ # Parameter for SubBatchNorm, where it splits the batch dimension into
28
+ # NUM_SPLITS splits, and runs BN on each split independently.
29
+ _C.BN.NUM_SPLITS = 1
30
+
31
+ # Parameter for NaiveSyncBatchNorm3d, where the stats across `NUM_SYNC_DEVICES`
32
+ # devices will be synchronized.
33
+ _C.BN.NUM_SYNC_DEVICES = 1
34
+
35
+
36
+ # ---------------------------------------------------------------------------- #
37
+ # Training options.
38
+ # ---------------------------------------------------------------------------- #
39
+ _C.TRAIN = CfgNode()
40
+
41
+ # If True, train the model; else, skip training.
42
+ _C.TRAIN.ENABLE = True
43
+
44
+ # Dataset.
45
+ _C.TRAIN.DATASET = "kinetics"
46
+
47
+ ##
48
+ _C.TRAIN.FINETUNE = False
49
+
50
+ # Total mini-batch size.
51
+ _C.TRAIN.BATCH_SIZE = 64
52
+
53
+ # Evaluate model on test data every eval period epochs.
54
+ _C.TRAIN.EVAL_PERIOD = 10
55
+
56
+ # Save model checkpoint every checkpoint period epochs.
57
+ _C.TRAIN.CHECKPOINT_PERIOD = 10
58
+
59
+ # Resume training from the latest checkpoint in the output directory.
60
+ _C.TRAIN.AUTO_RESUME = True
61
+
62
+ # Path to the checkpoint to load the initial weight.
63
+ _C.TRAIN.CHECKPOINT_FILE_PATH = ""
64
+
65
+ # Checkpoint types include `caffe2` or `pytorch`.
66
+ _C.TRAIN.CHECKPOINT_TYPE = "pytorch"
67
+
68
+ # If True, perform inflation when loading checkpoint.
69
+ _C.TRAIN.CHECKPOINT_INFLATE = False
70
+
71
+ # If True, reset epochs when loading checkpoint.
72
+ _C.TRAIN.CHECKPOINT_EPOCH_RESET = False
73
+
74
+ # If set, clear all layer names according to the pattern provided.
75
+ _C.TRAIN.CHECKPOINT_CLEAR_NAME_PATTERN = () # ("backbone.",)
76
+
77
+ # ---------------------------------------------------------------------------- #
78
+ # Testing options
79
+ # ---------------------------------------------------------------------------- #
80
+ _C.TEST = CfgNode()
81
+
82
+ # If True, test the model; else, skip testing.
83
+ _C.TEST.ENABLE = True
84
+
85
+ # Dataset for testing.
86
+ _C.TEST.DATASET = "kinetics"
87
+
88
+ # Total mini-batch size
89
+ _C.TEST.BATCH_SIZE = 8
90
+
91
+ # Path to the checkpoint to load the initial weight.
92
+ _C.TEST.CHECKPOINT_FILE_PATH = ""
93
+
94
+ # Number of clips to sample from a video uniformly for aggregating the
95
+ # prediction results.
96
+ _C.TEST.NUM_ENSEMBLE_VIEWS = 10
97
+
98
+ # Number of crops to sample from a frame spatially for aggregating the
99
+ # prediction results.
100
+ _C.TEST.NUM_SPATIAL_CROPS = 3
101
+
102
+ # Checkpoint types include `caffe2` or `pytorch`.
103
+ _C.TEST.CHECKPOINT_TYPE = "pytorch"
104
+ # Path to saving prediction results file.
105
+ _C.TEST.SAVE_RESULTS_PATH = ""
106
+ # -----------------------------------------------------------------------------
107
+ # ResNet options
108
+ # -----------------------------------------------------------------------------
109
+ _C.RESNET = CfgNode()
110
+
111
+ # Transformation function.
112
+ _C.RESNET.TRANS_FUNC = "bottleneck_transform"
113
+
114
+ # Number of groups (1 for ResNet, larger than 1 for ResNeXt).
115
+ _C.RESNET.NUM_GROUPS = 1
116
+
117
+ # Width of each group (64 -> ResNet; 4 -> ResNeXt).
118
+ _C.RESNET.WIDTH_PER_GROUP = 64
119
+
120
+ # Apply ReLU in an inplace manner.
121
+ _C.RESNET.INPLACE_RELU = True
122
+
123
+ # Apply stride to 1x1 conv.
124
+ _C.RESNET.STRIDE_1X1 = False
125
+
126
+ # If true, initialize the gamma of the final BN of each block to zero.
127
+ _C.RESNET.ZERO_INIT_FINAL_BN = False
128
+
129
+ # Number of weight layers.
130
+ _C.RESNET.DEPTH = 50
131
+
132
+ # If the current block has more than NUM_BLOCK_TEMP_KERNEL blocks, use temporal
133
+ # kernel of 1 for the rest of the blocks.
134
+ _C.RESNET.NUM_BLOCK_TEMP_KERNEL = [[3], [4], [6], [3]]
135
+
136
+ # Size of stride on different res stages.
137
+ _C.RESNET.SPATIAL_STRIDES = [[1], [2], [2], [2]]
138
+
139
+ # Size of dilation on different res stages.
140
+ _C.RESNET.SPATIAL_DILATIONS = [[1], [1], [1], [1]]
141
+
142
+ # ---------------------------------------------------------------------------- #
143
+ # X3D options
144
+ # See https://arxiv.org/abs/2004.04730 for details about X3D Networks.
145
+ # ---------------------------------------------------------------------------- #
146
+ _C.X3D = CfgNode()
147
+
148
+ # Width expansion factor.
149
+ _C.X3D.WIDTH_FACTOR = 1.0
150
+
151
+ # Depth expansion factor.
152
+ _C.X3D.DEPTH_FACTOR = 1.0
153
+
154
+ # Bottleneck expansion factor for the 3x3x3 conv.
155
+ _C.X3D.BOTTLENECK_FACTOR = 1.0 #
156
+
157
+ # Dimensions of the last linear layer before classification.
158
+ _C.X3D.DIM_C5 = 2048
159
+
160
+ # Dimensions of the first 3x3 conv layer.
161
+ _C.X3D.DIM_C1 = 12
162
+
163
+ # Whether to scale the width of Res2, default is false.
164
+ _C.X3D.SCALE_RES2 = False
165
+
166
+ # Whether to use a BatchNorm (BN) layer before the classifier, default is false.
167
+ _C.X3D.BN_LIN5 = False
168
+
169
+ # Whether to use channelwise (=depthwise) convolution in the center (3x3x3)
170
+ # convolution operation of the residual blocks.
171
+ _C.X3D.CHANNELWISE_3x3x3 = True
172
+
173
+ # -----------------------------------------------------------------------------
174
+ # Nonlocal options
175
+ # -----------------------------------------------------------------------------
176
+ _C.NONLOCAL = CfgNode()
177
+
178
+ # Index of each stage and block to add nonlocal layers.
179
+ _C.NONLOCAL.LOCATION = [[[]], [[]], [[]], [[]]]
180
+
181
+ # Number of groups for the nonlocal layers at each stage.
182
+ _C.NONLOCAL.GROUP = [[1], [1], [1], [1]]
183
+
184
+ # Instantiation to use for the non-local layer.
185
+ _C.NONLOCAL.INSTANTIATION = "dot_product"
186
+
187
+
188
+ # Size of pooling layers used in Non-Local.
189
+ _C.NONLOCAL.POOL = [
190
+ # Res2
191
+ [[1, 2, 2], [1, 2, 2]],
192
+ # Res3
193
+ [[1, 2, 2], [1, 2, 2]],
194
+ # Res4
195
+ [[1, 2, 2], [1, 2, 2]],
196
+ # Res5
197
+ [[1, 2, 2], [1, 2, 2]],
198
+ ]
199
+
200
+ # -----------------------------------------------------------------------------
201
+ # Model options
202
+ # -----------------------------------------------------------------------------
203
+ _C.MODEL = CfgNode()
204
+
205
+ # Model architecture.
206
+ _C.MODEL.ARCH = "slowfast"
207
+
208
+ # Model name
209
+ _C.MODEL.MODEL_NAME = "SlowFast"
210
+
211
+ # The number of classes to predict for the model.
212
+ _C.MODEL.NUM_CLASSES = 400
213
+
214
+ # Loss function.
215
+ _C.MODEL.LOSS_FUNC = "cross_entropy"
216
+
217
+ # Model architectures that have a single pathway.
218
+ _C.MODEL.SINGLE_PATHWAY_ARCH = ["c2d", "i3d", "slow", "x3d"]
219
+
220
+ # Model architectures that have multiple pathways.
221
+ _C.MODEL.MULTI_PATHWAY_ARCH = ["slowfast"]
222
+
223
+ # Dropout rate before final projection in the backbone.
224
+ _C.MODEL.DROPOUT_RATE = 0.5
225
+
226
+ # Random drop rate for Res-blocks, linearly increased from res2 to res5.
227
+ _C.MODEL.DROPCONNECT_RATE = 0.0
228
+
229
+ # The std to initialize the fc layer(s).
230
+ _C.MODEL.FC_INIT_STD = 0.01
231
+
232
+ # Activation layer for the output head.
233
+ _C.MODEL.HEAD_ACT = "softmax"
234
+
235
+
236
+ # -----------------------------------------------------------------------------
237
+ # SlowFast options
238
+ # -----------------------------------------------------------------------------
239
+ _C.SLOWFAST = CfgNode()
240
+
241
+ # Corresponds to the inverse of the channel reduction ratio, $\beta$ between
242
+ # the Slow and Fast pathways.
243
+ _C.SLOWFAST.BETA_INV = 8
244
+
245
+ # Corresponds to the frame rate reduction ratio, $\alpha$ between the Slow and
246
+ # Fast pathways.
247
+ _C.SLOWFAST.ALPHA = 8
248
+
249
+ # Ratio of channel dimensions between the Slow and Fast pathways.
250
+ _C.SLOWFAST.FUSION_CONV_CHANNEL_RATIO = 2
251
+
252
+ # Kernel dimension used for fusing information from Fast pathway to Slow
253
+ # pathway.
254
+ _C.SLOWFAST.FUSION_KERNEL_SZ = 5
255
+
256
+ ####### TimeSformer Options
257
+ _C.TIMESFORMER = CfgNode()
258
+ _C.TIMESFORMER.ATTENTION_TYPE = 'divided_space_time'
259
+ _C.TIMESFORMER.PRETRAINED_MODEL = ''
260
+
261
+ ## MixUp parameters
262
+ _C.MIXUP = CfgNode()
263
+ _C.MIXUP.ENABLED = False
264
+ _C.MIXUP.ALPHA = 0.8
265
+ _C.MIXUP.CUTMIX_ALPHA = 1.0
266
+ _C.MIXUP.CUTMIX_MINMAX = None
267
+ _C.MIXUP.PROB = 1.0
268
+ _C.MIXUP.SWITCH_PROB = 0.5
269
+ _C.MIXUP.MODE = 'batch'
270
+
271
+ _C.EMA = CfgNode()
272
+ _C.EMA.ENABLED = False
273
+
274
+ # -----------------------------------------------------------------------------
275
+ # Data options
276
+ # -----------------------------------------------------------------------------
277
+ _C.DATA = CfgNode()
278
+
279
+ # The path to the data directory.
280
+ _C.DATA.PATH_TO_DATA_DIR = ""
281
+
282
+ # The separator used between path and label.
283
+ _C.DATA.PATH_LABEL_SEPARATOR = " "
284
+
285
+ # Video path prefix if any.
286
+ _C.DATA.PATH_PREFIX = ""
287
+
288
+ # The spatial crop size of the input clip.
289
+ _C.DATA.CROP_SIZE = 224
290
+
291
+ # The number of frames of the input clip.
292
+ _C.DATA.NUM_FRAMES = 8
293
+
294
+ # The video sampling rate of the input clip.
295
+ _C.DATA.SAMPLING_RATE = 8
296
+
297
+ # The mean value of the video raw pixels across the R G B channels.
298
+ _C.DATA.MEAN = [0.45, 0.45, 0.45]
299
+ # List of input frame channel dimensions.
300
+
301
+ _C.DATA.INPUT_CHANNEL_NUM = [3, 3]
302
+
303
+ # The std value of the video raw pixels across the R G B channels.
304
+ _C.DATA.STD = [0.225, 0.225, 0.225]
305
+
306
+ # The spatial augmentation jitter scales for training.
307
+ _C.DATA.TRAIN_JITTER_SCALES = [256, 320]
308
+
309
+ # The spatial crop size for training.
310
+ _C.DATA.TRAIN_CROP_SIZE = 224
311
+
312
+ # The spatial crop size for testing.
313
+ _C.DATA.TEST_CROP_SIZE = 256
314
+
315
+ # Input videos may have different fps; convert them to the target video fps before
316
+ # frame sampling.
317
+ _C.DATA.TARGET_FPS = 30
318
+
319
+ # Decoding backend, options include `pyav` or `torchvision`
320
+ _C.DATA.DECODING_BACKEND = "pyav"
321
+
322
+ # if True, sample uniformly in [1 / max_scale, 1 / min_scale] and take a
323
+ # reciprocal to get the scale. If False, take a uniform sample from
324
+ # [min_scale, max_scale].
325
+ _C.DATA.INV_UNIFORM_SAMPLE = False
326
+
327
+ # If True, perform random horizontal flip on the video frames during training.
328
+ _C.DATA.RANDOM_FLIP = True
329
+
330
+ # If True, calculate the mAP as the metric.
331
+ _C.DATA.MULTI_LABEL = False
332
+
333
+ # Method to perform the ensemble, options include "sum" and "max".
334
+ _C.DATA.ENSEMBLE_METHOD = "sum"
335
+
336
+ # If True, revert the default input channel order (RGB <-> BGR).
337
+ _C.DATA.REVERSE_INPUT_CHANNEL = False
338
+
339
+ ############
340
+ _C.DATA.TEMPORAL_EXTENT = 8
341
+ _C.DATA.DEIT_TRANSFORMS = False
342
+ _C.DATA.COLOR_JITTER = 0.
343
+ _C.DATA.AUTO_AUGMENT = ''
344
+ _C.DATA.RE_PROB = 0.0
345
+
346
+ # ---------------------------------------------------------------------------- #
347
+ # Optimizer options
348
+ # ---------------------------------------------------------------------------- #
349
+ _C.SOLVER = CfgNode()
350
+
351
+ # Base learning rate.
352
+ _C.SOLVER.BASE_LR = 0.1
353
+
354
+ # Learning rate policy (see utils/lr_policy.py for options and examples).
355
+ _C.SOLVER.LR_POLICY = "cosine"
356
+
357
+ # Final learning rates for 'cosine' policy.
358
+ _C.SOLVER.COSINE_END_LR = 0.0
359
+
360
+ # Exponential decay factor.
361
+ _C.SOLVER.GAMMA = 0.1
362
+
363
+ # Step size for 'exp' and 'cos' policies (in epochs).
364
+ _C.SOLVER.STEP_SIZE = 1
365
+
366
+ # Steps for 'steps_' policies (in epochs).
367
+ _C.SOLVER.STEPS = []
368
+
369
+ # Learning rates for 'steps_' policies.
370
+ _C.SOLVER.LRS = []
371
+
372
+ # Maximal number of epochs.
373
+ _C.SOLVER.MAX_EPOCH = 300
374
+
375
+ # Momentum.
376
+ _C.SOLVER.MOMENTUM = 0.9
377
+
378
+ # Momentum dampening.
379
+ _C.SOLVER.DAMPENING = 0.0
380
+
381
+ # Nesterov momentum.
382
+ _C.SOLVER.NESTEROV = True
383
+
384
+ # L2 regularization.
385
+ _C.SOLVER.WEIGHT_DECAY = 1e-4
386
+
387
+ # Start the warm up from SOLVER.BASE_LR * SOLVER.WARMUP_FACTOR.
388
+ _C.SOLVER.WARMUP_FACTOR = 0.1
389
+
390
+ # Gradually warm up the SOLVER.BASE_LR over this number of epochs.
391
+ _C.SOLVER.WARMUP_EPOCHS = 0.0
392
+
393
+ # The start learning rate of the warm up.
394
+ _C.SOLVER.WARMUP_START_LR = 0.01
395
+
396
+ # Optimization method.
397
+ _C.SOLVER.OPTIMIZING_METHOD = "sgd"
398
+
399
+ # Base learning rate is linearly scaled with NUM_SHARDS.
400
+ _C.SOLVER.BASE_LR_SCALE_NUM_SHARDS = False
401
+
402
+ # ---------------------------------------------------------------------------- #
403
+ # Misc options
404
+ # ---------------------------------------------------------------------------- #
405
+
406
+ # Number of GPUs to use (applies to both training and testing).
407
+ _C.NUM_GPUS = 1
408
+
409
+ # Number of machines to use for the job.
410
+ _C.NUM_SHARDS = 1
411
+
412
+ # The index of the current machine.
413
+ _C.SHARD_ID = 0
414
+
415
+ # Output basedir.
416
+ _C.OUTPUT_DIR = "./tmp"
417
+
418
+ # Note that non-determinism may still be present due to non-deterministic
419
+ # operator implementations in GPU operator libraries.
420
+ _C.RNG_SEED = 1
421
+
422
+ # Log period in iters.
423
+ _C.LOG_PERIOD = 10
424
+
425
+ # If True, log the model info.
426
+ _C.LOG_MODEL_INFO = False
427
+
428
+ # Distributed backend.
429
+ _C.DIST_BACKEND = "nccl"
430
+
431
+ # Global batch size
432
+ _C.GLOBAL_BATCH_SIZE = 64
433
+
434
+ # ---------------------------------------------------------------------------- #
435
+ # Benchmark options
436
+ # ---------------------------------------------------------------------------- #
437
+ _C.BENCHMARK = CfgNode()
438
+
439
+ # Number of epochs for data loading benchmark.
440
+ _C.BENCHMARK.NUM_EPOCHS = 5
441
+
442
+ # Log period in iters for data loading benchmark.
443
+ _C.BENCHMARK.LOG_PERIOD = 100
444
+
445
+ # If True, shuffle dataloader for epoch during benchmark.
446
+ _C.BENCHMARK.SHUFFLE = True
447
+
448
+
449
+ # ---------------------------------------------------------------------------- #
450
+ # Common train/test data loader options
451
+ # ---------------------------------------------------------------------------- #
452
+ _C.DATA_LOADER = CfgNode()
453
+
454
+ # Number of data loader workers per training process.
455
+ _C.DATA_LOADER.NUM_WORKERS = 8
456
+
457
+ # Load data to pinned host memory.
458
+ _C.DATA_LOADER.PIN_MEMORY = True
459
+
460
+ # Enable multi thread decoding.
461
+ _C.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE = False
462
+
463
+
464
+ # ---------------------------------------------------------------------------- #
465
+ # Detection options.
466
+ # ---------------------------------------------------------------------------- #
467
+ _C.DETECTION = CfgNode()
468
+
469
+ # Whether enable video detection.
470
+ _C.DETECTION.ENABLE = False
471
+
472
+ # Aligned version of RoI. More details can be found at slowfast/models/head_helper.py
473
+ _C.DETECTION.ALIGNED = True
474
+
475
+ # Spatial scale factor.
476
+ _C.DETECTION.SPATIAL_SCALE_FACTOR = 16
477
+
478
+ # RoI transformation resolution.
479
+ _C.DETECTION.ROI_XFORM_RESOLUTION = 7
480
+
481
+
482
+ # -----------------------------------------------------------------------------
483
+ # AVA Dataset options
484
+ # -----------------------------------------------------------------------------
485
+ _C.AVA = CfgNode()
486
+
487
+ # Directory path of frames.
488
+ _C.AVA.FRAME_DIR = "/mnt/fair-flash3-east/ava_trainval_frames.img/"
489
+
490
+ # Directory path for files of frame lists.
491
+ _C.AVA.FRAME_LIST_DIR = (
492
+ "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/"
493
+ )
494
+
495
+ # Directory path for annotation files.
496
+ _C.AVA.ANNOTATION_DIR = (
497
+ "/mnt/vol/gfsai-flash3-east/ai-group/users/haoqifan/ava/frame_list/"
498
+ )
499
+
500
+ # Filenames of training samples list files.
501
+ _C.AVA.TRAIN_LISTS = ["train.csv"]
502
+
503
+ # Filenames of test samples list files.
504
+ _C.AVA.TEST_LISTS = ["val.csv"]
505
+
506
+ # Filenames of box list files for training. Note that we assume files which
507
+ # contains predicted boxes will have a suffix "predicted_boxes" in the
508
+ # filename.
509
+ _C.AVA.TRAIN_GT_BOX_LISTS = ["ava_train_v2.2.csv"]
510
+ _C.AVA.TRAIN_PREDICT_BOX_LISTS = []
511
+
512
+ # Filenames of box list files for test.
513
+ _C.AVA.TEST_PREDICT_BOX_LISTS = ["ava_val_predicted_boxes.csv"]
514
+
515
+ # This option controls the score threshold for the predicted boxes to use.
516
+ _C.AVA.DETECTION_SCORE_THRESH = 0.9
517
+
518
+ # Whether to use BGR as the format of the input frames.
519
+ _C.AVA.BGR = False
520
+
521
+ # Training augmentation parameters
522
+ # Whether to use color augmentation method.
523
+ _C.AVA.TRAIN_USE_COLOR_AUGMENTATION = False
524
+
525
+ # Whether to only use PCA jitter augmentation when using color augmentation
526
+ # method (otherwise combine with color jitter method).
527
+ _C.AVA.TRAIN_PCA_JITTER_ONLY = True
528
+
529
+ # Eigenvalues for PCA jittering. Note PCA is RGB based.
530
+ _C.AVA.TRAIN_PCA_EIGVAL = [0.225, 0.224, 0.229]
531
+
532
+ # Eigenvectors for PCA jittering.
533
+ _C.AVA.TRAIN_PCA_EIGVEC = [
534
+ [-0.5675, 0.7192, 0.4009],
535
+ [-0.5808, -0.0045, -0.8140],
536
+ [-0.5836, -0.6948, 0.4203],
537
+ ]
538
+
539
+ # Whether to do horizontal flipping during test.
540
+ _C.AVA.TEST_FORCE_FLIP = False
541
+
542
+ # Whether to use full test set for validation split.
543
+ _C.AVA.FULL_TEST_ON_VAL = False
544
+
545
+ # The name of the file to the ava label map.
546
+ _C.AVA.LABEL_MAP_FILE = "ava_action_list_v2.2_for_activitynet_2019.pbtxt"
547
+
548
+ # The name of the file to the ava exclusion.
549
+ _C.AVA.EXCLUSION_FILE = "ava_val_excluded_timestamps_v2.2.csv"
550
+
551
+ # The name of the file to the ava groundtruth.
552
+ _C.AVA.GROUNDTRUTH_FILE = "ava_val_v2.2.csv"
553
+
554
+ # Backend to process image, includes `pytorch` and `cv2`.
555
+ _C.AVA.IMG_PROC_BACKEND = "cv2"
556
+
557
+ # ---------------------------------------------------------------------------- #
558
+ # Multigrid training options
559
+ # See https://arxiv.org/abs/1912.00998 for details about multigrid training.
560
+ # ---------------------------------------------------------------------------- #
561
+ _C.MULTIGRID = CfgNode()
562
+
563
+ # Multigrid training allows us to train for more epochs with fewer iterations.
564
+ # This hyperparameter specifies how many times more epochs to train.
565
+ # The default setting in paper trains for 1.5x more epochs than baseline.
566
+ _C.MULTIGRID.EPOCH_FACTOR = 1.5
567
+
568
+ # Enable short cycles.
569
+ _C.MULTIGRID.SHORT_CYCLE = False
570
+ # Short cycle additional spatial dimensions relative to the default crop size.
571
+ _C.MULTIGRID.SHORT_CYCLE_FACTORS = [0.5, 0.5 ** 0.5]
572
+
573
+ _C.MULTIGRID.LONG_CYCLE = False
574
+ # (Temporal, Spatial) dimensions relative to the default shape.
575
+ _C.MULTIGRID.LONG_CYCLE_FACTORS = [
576
+ (0.25, 0.5 ** 0.5),
577
+ (0.5, 0.5 ** 0.5),
578
+ (0.5, 1),
579
+ (1, 1),
580
+ ]
581
+
582
+ # While a standard BN computes stats across all examples in a GPU,
583
+ # for multigrid training we fix the number of clips to compute BN stats on.
584
+ # See https://arxiv.org/abs/1912.00998 for details.
585
+ _C.MULTIGRID.BN_BASE_SIZE = 8
586
+
587
+ # Multigrid training epochs are not proportional to actual training time or
588
+ # computations, so _C.TRAIN.EVAL_PERIOD leads to too frequent or rare
589
+ # evaluation. We use a multigrid-specific rule to determine when to evaluate:
590
+ # This hyperparameter defines how many times to evaluate a model per long
591
+ # cycle shape.
592
+ _C.MULTIGRID.EVAL_FREQ = 3
593
+
594
+ # No need to specify; Set automatically and used as global variables.
595
+ _C.MULTIGRID.LONG_CYCLE_SAMPLING_RATE = 0
596
+ _C.MULTIGRID.DEFAULT_B = 0
597
+ _C.MULTIGRID.DEFAULT_T = 0
598
+ _C.MULTIGRID.DEFAULT_S = 0
599
+
600
+ # -----------------------------------------------------------------------------
601
+ # Tensorboard Visualization Options
602
+ # -----------------------------------------------------------------------------
603
+ _C.TENSORBOARD = CfgNode()
604
+
605
+ # Log to summary writer; this will automatically
606
+ # log loss, lr, and metrics during train/eval.
607
+ _C.TENSORBOARD.ENABLE = False
608
+ # Provide path to prediction results for visualization.
609
+ # This is a pickle file of [prediction_tensor, label_tensor]
610
+ _C.TENSORBOARD.PREDICTIONS_PATH = ""
611
+ # Path to directory for tensorboard logs.
612
+ # Defaults to cfg.OUTPUT_DIR/runs-{cfg.TRAIN.DATASET}.
613
+ _C.TENSORBOARD.LOG_DIR = ""
614
+ # Path to a json file providing class_name - id mapping
615
+ # in the format {"class_name1": id1, "class_name2": id2, ...}.
616
+ # This file must be provided to enable plotting confusion matrix
617
+ # by a subset or parent categories.
618
+ _C.TENSORBOARD.CLASS_NAMES_PATH = ""
619
+
620
+ # Path to a json file for categories -> classes mapping
621
+ # in the format {"parent_class": ["child_class1", "child_class2",...], ...}.
622
+ _C.TENSORBOARD.CATEGORIES_PATH = ""
623
+
624
+ # Config for confusion matrices visualization.
625
+ _C.TENSORBOARD.CONFUSION_MATRIX = CfgNode()
626
+ # Visualize confusion matrix.
627
+ _C.TENSORBOARD.CONFUSION_MATRIX.ENABLE = False
628
+ # Figure size of the confusion matrices plotted.
629
+ _C.TENSORBOARD.CONFUSION_MATRIX.FIGSIZE = [8, 8]
630
+ # Path to a subset of categories to visualize.
631
+ # File contains class names separated by newline characters.
632
+ _C.TENSORBOARD.CONFUSION_MATRIX.SUBSET_PATH = ""
633
+
634
+ # Config for histogram visualization.
635
+ _C.TENSORBOARD.HISTOGRAM = CfgNode()
636
+ # Visualize histograms.
637
+ _C.TENSORBOARD.HISTOGRAM.ENABLE = False
638
+ # Path to a subset of classes to plot histograms.
639
+ # Class names must be separated by newline characters.
640
+ _C.TENSORBOARD.HISTOGRAM.SUBSET_PATH = ""
641
+ # Visualize top-k most predicted classes on histograms for each
642
+ # chosen true label.
643
+ _C.TENSORBOARD.HISTOGRAM.TOPK = 10
644
+ # Figure size of the histograms plotted.
645
+ _C.TENSORBOARD.HISTOGRAM.FIGSIZE = [8, 8]
646
+
647
+ # Config for layers' weights and activations visualization.
648
+ # _C.TENSORBOARD.ENABLE must be True.
649
+ _C.TENSORBOARD.MODEL_VIS = CfgNode()
650
+
651
+ # If False, skip model visualization.
652
+ _C.TENSORBOARD.MODEL_VIS.ENABLE = False
653
+
654
+ # If False, skip visualizing model weights.
655
+ _C.TENSORBOARD.MODEL_VIS.MODEL_WEIGHTS = False
656
+
657
+ # If False, skip visualizing model activations.
658
+ _C.TENSORBOARD.MODEL_VIS.ACTIVATIONS = False
659
+
660
+ # If False, skip visualizing input videos.
661
+ _C.TENSORBOARD.MODEL_VIS.INPUT_VIDEO = False
662
+
663
+
664
+ # List of strings containing data about layer names and their indexing to
665
+ # visualize weights and activations for. The indexing is meant for
666
+ # choosing a subset of activations output by a layer for visualization.
667
+ # If indexing is not specified, visualize all activations output by the layer.
668
+ # For each string, layer name and indexing is separated by whitespaces.
669
+ # e.g.: [layer1 1,2;1,2, layer2, layer3 150,151;3,4]; this means for each array `arr`
670
+ # along the batch dimension in `layer1`, we take arr[[1, 2], [1, 2]]
671
+ _C.TENSORBOARD.MODEL_VIS.LAYER_LIST = []
672
+ # Top-k predictions to plot on videos
673
+ _C.TENSORBOARD.MODEL_VIS.TOPK_PREDS = 1
674
+ # Colormap for text box and bounding box colors.
675
+ _C.TENSORBOARD.MODEL_VIS.COLORMAP = "Pastel2"
676
+ # Config for visualization video inputs with Grad-CAM.
677
+ # _C.TENSORBOARD.ENABLE must be True.
678
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM = CfgNode()
679
+ # Whether to run visualization using Grad-CAM technique.
680
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.ENABLE = True
681
+ # CNN layers to use for Grad-CAM. The number of layers must be equal to
682
+ # number of pathway(s).
683
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.LAYER_LIST = []
684
+ # If True, visualize Grad-CAM using the true label for each instance.
685
+ # If False, use the highest predicted class.
686
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.USE_TRUE_LABEL = False
687
+ # Colormap for text box and bounding box colors.
688
+ _C.TENSORBOARD.MODEL_VIS.GRAD_CAM.COLORMAP = "viridis"
689
+
690
+ # Config for visualization for wrong prediction visualization.
691
+ # _C.TENSORBOARD.ENABLE must be True.
692
+ _C.TENSORBOARD.WRONG_PRED_VIS = CfgNode()
693
+ _C.TENSORBOARD.WRONG_PRED_VIS.ENABLE = False
694
+ # Folder tag to organize model eval videos under.
695
+ _C.TENSORBOARD.WRONG_PRED_VIS.TAG = "Incorrectly classified videos."
696
+ # Subset of labels to visualize. Only wrong predictions with true labels
697
+ # within this subset are visualized.
698
+ _C.TENSORBOARD.WRONG_PRED_VIS.SUBSET_PATH = ""
699
+
700
+
701
+ # ---------------------------------------------------------------------------- #
702
+ # Demo options
703
+ # ---------------------------------------------------------------------------- #
704
+ _C.DEMO = CfgNode()
705
+
706
+ # Run model in DEMO mode.
707
+ _C.DEMO.ENABLE = False
708
+
709
+ # Path to a json file providing class_name - id mapping
710
+ # in the format {"class_name1": id1, "class_name2": id2, ...}.
711
+ _C.DEMO.LABEL_FILE_PATH = ""
712
+
713
+ # Specify a camera device as input. This will be prioritized
714
+ # over input video if set.
715
+ # If -1, use input video instead.
716
+ _C.DEMO.WEBCAM = -1
717
+
718
+ # Path to input video for demo.
719
+ _C.DEMO.INPUT_VIDEO = ""
720
+ # Custom width for reading input video data.
721
+ _C.DEMO.DISPLAY_WIDTH = 0
722
+ # Custom height for reading input video data.
723
+ _C.DEMO.DISPLAY_HEIGHT = 0
724
+ # Path to Detectron2 object detection model configuration,
725
+ # only used for detection tasks.
726
+ _C.DEMO.DETECTRON2_CFG = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
727
+ # Path to Detectron2 object detection model pre-trained weights.
728
+ _C.DEMO.DETECTRON2_WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl"
729
+ # Threshold for choosing predicted bounding boxes by Detectron2.
730
+ _C.DEMO.DETECTRON2_THRESH = 0.9
731
+ # Number of overlapping frames between 2 consecutive clips.
732
+ # Increase this number for more frequent action predictions.
733
+ # The number of overlapping frames cannot be larger than
734
+ # half of the sequence length `cfg.DATA.NUM_FRAMES * cfg.DATA.SAMPLING_RATE`
735
+ _C.DEMO.BUFFER_SIZE = 0
736
+ # If specified, the visualized outputs will be written to a video file at
737
+ # this path. Otherwise, the visualized outputs will be displayed in a window.
738
+ _C.DEMO.OUTPUT_FILE = ""
739
+ # Frames per second rate for writing to output video file.
740
+ # If not set (-1), use fps rate from input file.
741
+ _C.DEMO.OUTPUT_FPS = -1
742
+ # Input format from demo video reader ("RGB" or "BGR").
743
+ _C.DEMO.INPUT_FORMAT = "BGR"
744
+ # Draw visualization frames in [keyframe_idx - CLIP_VIS_SIZE, keyframe_idx + CLIP_VIS_SIZE] inclusively.
745
+ _C.DEMO.CLIP_VIS_SIZE = 10
746
+ # Number of processes to run video visualizer.
747
+ _C.DEMO.NUM_VIS_INSTANCES = 2
748
+
749
+ # Path to pre-computed predicted boxes
750
+ _C.DEMO.PREDS_BOXES = ""
751
+ # Whether to run with a multi-threaded video reader.
752
+ _C.DEMO.THREAD_ENABLE = False
753
+ # Take one clip for every `DEMO.NUM_CLIPS_SKIP` + 1 for prediction and visualization.
754
+ # This is used for fast demo speed by reducing the prediction/visualization frequency.
755
+ # If -1, take the most recent read clip for visualization. This mode is only supported
756
+ # if `DEMO.THREAD_ENABLE` is set to True.
757
+ _C.DEMO.NUM_CLIPS_SKIP = 0
758
+ # Path to ground-truth boxes and labels (optional)
759
+ _C.DEMO.GT_BOXES = ""
760
+ # The starting second of the video w.r.t bounding boxes file.
761
+ _C.DEMO.STARTING_SECOND = 900
762
+ # Frames per second of the input video/folder of images.
763
+ _C.DEMO.FPS = 30
764
+ # Visualize with top-k predictions or predictions above certain threshold(s).
765
+ # Option: {"thres", "top-k"}
766
+ _C.DEMO.VIS_MODE = "thres"
767
+ # Threshold for common class names.
768
+ _C.DEMO.COMMON_CLASS_THRES = 0.7
769
+ # Threshold for uncommon class names. This will not be
770
+ # used if `_C.DEMO.COMMON_CLASS_NAMES` is empty.
771
+ _C.DEMO.UNCOMMON_CLASS_THRES = 0.3
772
+ # This is chosen based on the distribution of examples in
773
+ # each class in the AVA dataset.
774
+ _C.DEMO.COMMON_CLASS_NAMES = [
775
+ "watch (a person)",
776
+ "talk to (e.g., self, a person, a group)",
777
+ "listen to (a person)",
778
+ "touch (an object)",
779
+ "carry/hold (an object)",
780
+ "walk",
781
+ "sit",
782
+ "lie/sleep",
783
+ "bend/bow (at the waist)",
784
+ ]
785
+ # Slow-motion rate for the visualization. The visualized portions of the
786
+ # video will be played `_C.DEMO.SLOWMO` times slower than usual speed.
787
+ _C.DEMO.SLOWMO = 1
788
+
789
+ def _assert_and_infer_cfg(cfg):
790
+ # BN assertions.
791
+ if cfg.BN.USE_PRECISE_STATS:
792
+ assert cfg.BN.NUM_BATCHES_PRECISE >= 0
793
+ # TRAIN assertions.
794
+ assert cfg.TRAIN.CHECKPOINT_TYPE in ["pytorch", "caffe2"]
795
+ assert cfg.TRAIN.BATCH_SIZE % cfg.NUM_GPUS == 0
796
+
797
+ # TEST assertions.
798
+ assert cfg.TEST.CHECKPOINT_TYPE in ["pytorch", "caffe2"]
799
+ assert cfg.TEST.BATCH_SIZE % cfg.NUM_GPUS == 0
800
+ assert cfg.TEST.NUM_SPATIAL_CROPS == 3
801
+
802
+ # RESNET assertions.
803
+ assert cfg.RESNET.NUM_GROUPS > 0
804
+ assert cfg.RESNET.WIDTH_PER_GROUP > 0
805
+ assert cfg.RESNET.WIDTH_PER_GROUP % cfg.RESNET.NUM_GROUPS == 0
806
+
807
+ # Execute LR scaling by num_shards.
808
+ if cfg.SOLVER.BASE_LR_SCALE_NUM_SHARDS:
809
+ cfg.SOLVER.BASE_LR *= cfg.NUM_SHARDS
810
+
811
+ # General assertions.
812
+ assert cfg.SHARD_ID < cfg.NUM_SHARDS
813
+ return cfg
814
+
815
+
816
+ def get_cfg():
817
+ """
818
+ Get a copy of the default config.
819
+ """
820
+ return _assert_and_infer_cfg(_C.clone())
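A minimal usage sketch (assuming the standard fvcore/yacs `CfgNode` API) of how these defaults are combined with one of the YAML files added under `configs/`; the config path and overrides below are illustrative:

```python
# Sketch: start from the defaults, then layer a YAML file and explicit
# overrides on top of them.
from timesformer.config.defaults import get_cfg

cfg = get_cfg()
# Values in the YAML override the defaults defined in this file.
cfg.merge_from_file("TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224.yaml")
# Explicit overrides take precedence over both the defaults and the YAML.
cfg.merge_from_list(["NUM_GPUS", 1, "TRAIN.BATCH_SIZE", 8])
print(cfg.TIMESFORMER.ATTENTION_TYPE)  # 'divided_space_time'
```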
TimeSformer/timesformer/datasets/DATASET.md ADDED
@@ -0,0 +1,26 @@
1
+ # Dataset Preparation
2
+
3
+ ## Kinetics
4
+
5
+ The Kinetics dataset can be downloaded from the following [link](https://github.com/cvdfoundation/kinetics-dataset).
6
+
7
+ After downloading all the videos, resize them so that the short edge is 256 pixels, then prepare the csv files for the training, validation, and testing sets as `train.csv`, `val.csv`, and `test.csv`. The format of each csv file is:
8
+
9
+ ```
10
+ path_to_video_1 label_1
11
+ path_to_video_2 label_2
12
+ path_to_video_3 label_3
13
+ ...
14
+ path_to_video_N label_N
15
+ ```
16
+
17
+ ## Something-Something V2
18
+ 1. Please download the dataset and annotations from [dataset provider](https://20bn.com/datasets/something-something).
19
+
20
+ 2. Download the *frame list* from the following links: ([train](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/train.csv), [val](https://dl.fbaipublicfiles.com/pyslowfast/dataset/ssv2/frame_lists/val.csv)).
21
+
22
+ 3. Extract the frames at 30 FPS. (We used ffmpeg-4.1.3 with command
23
+ `ffmpeg -i "${video}" -r 30 -q:v 1 "${out_name}"`
24
+ in experiments.) Please put the frames in a structure consistent with the frame lists.
25
+
26
+ Please put all annotation json files and the frame lists in the same folder, and set `DATA.PATH_TO_DATA_DIR` to the path. Set `DATA.PATH_PREFIX` to be the path to the folder containing extracted frames.
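As a small illustration of the space-separated format described above for Kinetics, a csv file can be produced with a few lines of Python (the video paths and integer labels below are placeholders):

```python
# Hypothetical example: write a Kinetics-style train.csv with one
# "<path_to_video> <label>" pair per line, separated by a single space
# (matching the DATA.PATH_LABEL_SEPARATOR default).
samples = [
    ("/data/kinetics/train/abseiling/clip_0001.mp4", 0),
    ("/data/kinetics/train/archery/clip_0002.mp4", 5),
]

with open("train.csv", "w") as f:
    for path, label in samples:
        f.write(f"{path} {label}\n")
```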
TimeSformer/timesformer/datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ from .build import DATASET_REGISTRY, build_dataset # noqa
4
+ from .kinetics import Kinetics # noqa
5
+ from .ssv2 import Ssv2 # noqa
TimeSformer/timesformer/datasets/build.py ADDED
@@ -0,0 +1,30 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ from fvcore.common.registry import Registry
4
+
5
+ DATASET_REGISTRY = Registry("DATASET")
6
+ DATASET_REGISTRY.__doc__ = """
7
+ Registry for dataset.
8
+
9
+ The registered object will be called with `obj(cfg, split)`.
10
+ The call should return a `torch.utils.data.Dataset` object.
11
+ """
12
+
13
+
14
+ def build_dataset(dataset_name, cfg, split):
15
+ """
16
+ Build a dataset, defined by `dataset_name`.
17
+ Args:
18
+ dataset_name (str): the name of the dataset to be constructed.
19
+ cfg (CfgNode): configs. Details can be found in
20
+ slowfast/config/defaults.py
21
+ split (str): the split of the data loader. Options include `train`,
22
+ `val`, and `test`.
23
+ Returns:
24
+ Dataset: a constructed dataset specified by dataset_name.
25
+ """
26
+ # Capitalize the first letter of the dataset_name since the dataset_name
27
+ # in configs may be in lowercase but the name of dataset class should always
28
+ # start with an uppercase letter.
29
+ name = dataset_name.capitalize()
30
+ return DATASET_REGISTRY.get(name)(cfg, split)
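For context, a dataset becomes visible to `build_dataset` by registering itself in `DATASET_REGISTRY`; a minimal sketch of that pattern (the class name and fields below are hypothetical):

```python
# Sketch of the registration pattern used by the dataset classes in this
# package; build_dataset("mycustomvideos", cfg, split) capitalizes the name
# and looks up the class registered below.
import torch.utils.data

from timesformer.datasets.build import DATASET_REGISTRY


@DATASET_REGISTRY.register()
class Mycustomvideos(torch.utils.data.Dataset):
    def __init__(self, cfg, split):
        self.cfg = cfg
        self.split = split
        self._items = []  # placeholder sample list

    def __len__(self):
        return len(self._items)

    def __getitem__(self, index):
        return self._items[index]
```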
TimeSformer/timesformer/datasets/cv2_transform.py ADDED
@@ -0,0 +1,796 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import math
4
+ import numpy as np
5
+ import cv2
6
+
7
+
8
+ def clip_boxes_to_image(boxes, height, width):
9
+ """
10
+ Clip the boxes with the height and width of the image size.
11
+ Args:
12
+ boxes (ndarray): bounding boxes to perform crop. The dimension is
13
+ `num boxes` x 4.
14
+ height (int): the height of the image.
15
+ width (int): the width of the image.
16
+ Returns:
17
+ boxes (ndarray): cropped bounding boxes.
18
+ """
19
+ boxes[:, [0, 2]] = np.minimum(
20
+ width - 1.0, np.maximum(0.0, boxes[:, [0, 2]])
21
+ )
22
+ boxes[:, [1, 3]] = np.minimum(
23
+ height - 1.0, np.maximum(0.0, boxes[:, [1, 3]])
24
+ )
25
+ return boxes
26
+
27
+
28
+ def random_short_side_scale_jitter_list(images, min_size, max_size, boxes=None):
29
+ """
30
+ Perform a spatial short scale jittering on the given images and
31
+ corresponding boxes.
32
+ Args:
33
+ images (list): list of images to perform scale jitter. Dimension is
34
+ `height` x `width` x `channel`.
35
+ min_size (int): the minimal size to scale the frames.
36
+ max_size (int): the maximal size to scale the frames.
37
+ boxes (list): optional. Corresponding boxes to images. Dimension is
38
+ `num boxes` x 4.
39
+ Returns:
40
+ (list): the list of scaled images with dimension of
41
+ `new height` x `new width` x `channel`.
42
+ (ndarray or None): the scaled boxes with dimension of
43
+ `num boxes` x 4.
44
+ """
45
+ size = int(round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size)))
46
+
47
+ height = images[0].shape[0]
48
+ width = images[0].shape[1]
49
+ if (width <= height and width == size) or (
50
+ height <= width and height == size
51
+ ):
52
+ return images, boxes
53
+ new_width = size
54
+ new_height = size
55
+ if width < height:
56
+ new_height = int(math.floor((float(height) / width) * size))
57
+ if boxes is not None:
58
+ boxes = [
59
+ proposal * float(new_height) / height for proposal in boxes
60
+ ]
61
+ else:
62
+ new_width = int(math.floor((float(width) / height) * size))
63
+ if boxes is not None:
64
+ boxes = [proposal * float(new_width) / width for proposal in boxes]
65
+ return (
66
+ [
67
+ cv2.resize(
68
+ image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
69
+ ).astype(np.float32)
70
+ for image in images
71
+ ],
72
+ boxes,
73
+ )
74
+
75
+
76
+ def scale(size, image):
77
+ """
78
+ Scale the short side of the image to size.
79
+ Args:
80
+ size (int): size to scale the image.
81
+ image (array): image to perform short side scale. Dimension is
82
+ `height` x `width` x `channel`.
83
+ Returns:
84
+ (ndarray): the scaled image with dimension of
85
+ `height` x `width` x `channel`.
86
+ """
87
+ height = image.shape[0]
88
+ width = image.shape[1]
89
+ if (width <= height and width == size) or (
90
+ height <= width and height == size
91
+ ):
92
+ return image
93
+ new_width = size
94
+ new_height = size
95
+ if width < height:
96
+ new_height = int(math.floor((float(height) / width) * size))
97
+ else:
98
+ new_width = int(math.floor((float(width) / height) * size))
99
+ img = cv2.resize(
100
+ image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
101
+ )
102
+ return img.astype(np.float32)
103
+
104
+
105
+ def scale_boxes(size, boxes, height, width):
106
+ """
107
+ Scale the short side of the box to size.
108
+ Args:
109
+ size (int): size to scale the image.
110
+ boxes (ndarray): bounding boxes to perform scale. The dimension is
111
+ `num boxes` x 4.
112
+ height (int): the height of the image.
113
+ width (int): the width of the image.
114
+ Returns:
115
+ boxes (ndarray): scaled bounding boxes.
116
+ """
117
+ if (width <= height and width == size) or (
118
+ height <= width and height == size
119
+ ):
120
+ return boxes
121
+
122
+ new_width = size
123
+ new_height = size
124
+ if width < height:
125
+ new_height = int(math.floor((float(height) / width) * size))
126
+ boxes *= float(new_height) / height
127
+ else:
128
+ new_width = int(math.floor((float(width) / height) * size))
129
+ boxes *= float(new_width) / width
130
+ return boxes
131
+
132
+
133
+ def horizontal_flip_list(prob, images, order="CHW", boxes=None):
134
+ """
135
+ Horizontally flip the list of images and optional boxes.
136
+ Args:
137
+ prob (float): probability to flip.
138
+ images (list): list of images to flip horizontally. Dimension is
139
+ `height` x `width` x `channel` or `channel` x `height` x `width`.
140
+ order (str): order of the `height`, `channel` and `width`.
141
+ boxes (list): optional. Corresponding boxes to images.
142
+ Dimension is `num boxes` x 4.
143
+ Returns:
144
+ (ndarray): the scaled image with dimension of
145
+ `height` x `width` x `channel`.
146
+ (list): optional. Corresponding boxes to images. Dimension is
147
+ `num boxes` x 4.
148
+ """
149
+ _, width, _ = images[0].shape
150
+ if np.random.uniform() < prob:
151
+ if boxes is not None:
152
+ boxes = [flip_boxes(proposal, width) for proposal in boxes]
153
+ if order == "CHW":
154
+ out_images = []
155
+ for image in images:
156
+ image = np.asarray(image).swapaxes(2, 0)
157
+ image = image[::-1]
158
+ out_images.append(image.swapaxes(0, 2))
159
+ return out_images, boxes
160
+ elif order == "HWC":
161
+ return [cv2.flip(image, 1) for image in images], boxes
162
+ return images, boxes
163
+
164
+
165
+ def spatial_shift_crop_list(size, images, spatial_shift_pos, boxes=None):
166
+ """
167
+ Perform left, center, or right crop of the given list of images.
168
+ Args:
169
+ size (int): size to crop.
170
+ images (list): list of images to crop. Dimension is
171
+ `height` x `width` x `channel` or `channel` x `height` x `width`.
172
+ spatial_shift_pos (int): option includes 0 (left), 1 (middle), and
173
+ 2 (right) crop.
174
+ boxes (list): optional. Corresponding boxes to images.
175
+ Dimension is `num boxes` x 4.
176
+ Returns:
177
+ cropped (ndarray): the cropped list of images with dimension of
178
+ `height` x `width` x `channel`.
179
+ boxes (list): optional. Corresponding boxes to images. Dimension is
180
+ `num boxes` x 4.
181
+ """
182
+
183
+ assert spatial_shift_pos in [0, 1, 2]
184
+
185
+ height = images[0].shape[0]
186
+ width = images[0].shape[1]
187
+ y_offset = int(math.ceil((height - size) / 2))
188
+ x_offset = int(math.ceil((width - size) / 2))
189
+
190
+ if height > width:
191
+ if spatial_shift_pos == 0:
192
+ y_offset = 0
193
+ elif spatial_shift_pos == 2:
194
+ y_offset = height - size
195
+ else:
196
+ if spatial_shift_pos == 0:
197
+ x_offset = 0
198
+ elif spatial_shift_pos == 2:
199
+ x_offset = width - size
200
+
201
+ cropped = [
202
+ image[y_offset : y_offset + size, x_offset : x_offset + size, :]
203
+ for image in images
204
+ ]
205
+ assert cropped[0].shape[0] == size, "Image height not cropped properly"
206
+ assert cropped[0].shape[1] == size, "Image width not cropped properly"
207
+
208
+ if boxes is not None:
209
+ for i in range(len(boxes)):
210
+ boxes[i][:, [0, 2]] -= x_offset
211
+ boxes[i][:, [1, 3]] -= y_offset
212
+ return cropped, boxes
213
+
214
+
215
+ def CHW2HWC(image):
216
+ """
217
+ Transpose the dimension from `channel` x `height` x `width` to
218
+ `height` x `width` x `channel`.
219
+ Args:
220
+ image (array): image to transpose.
221
+ Returns
222
+ (array): transposed image.
223
+ """
224
+ return image.transpose([1, 2, 0])
225
+
226
+
227
+ def HWC2CHW(image):
228
+ """
229
+ Transpose the dimension from `height` x `width` x `channel` to
230
+ `channel` x `height` x `width`.
231
+ Args:
232
+ image (array): image to transpose.
233
+ Returns
234
+ (array): transposed image.
235
+ """
236
+ return image.transpose([2, 0, 1])
237
+
238
+
239
+ def color_jitter_list(
240
+ images, img_brightness=0, img_contrast=0, img_saturation=0
241
+ ):
242
+ """
243
+ Perform color jitter on the list of images.
244
+ Args:
245
+ images (list): list of images to perform color jitter.
246
+ img_brightness (float): jitter ratio for brightness.
247
+ img_contrast (float): jitter ratio for contrast.
248
+ img_saturation (float): jitter ratio for saturation.
249
+ Returns:
250
+ images (list): the jittered list of images.
251
+ """
252
+ jitter = []
253
+ if img_brightness != 0:
254
+ jitter.append("brightness")
255
+ if img_contrast != 0:
256
+ jitter.append("contrast")
257
+ if img_saturation != 0:
258
+ jitter.append("saturation")
259
+
260
+ if len(jitter) > 0:
261
+ order = np.random.permutation(np.arange(len(jitter)))
262
+ for idx in range(0, len(jitter)):
263
+ if jitter[order[idx]] == "brightness":
264
+ images = brightness_list(img_brightness, images)
265
+ elif jitter[order[idx]] == "contrast":
266
+ images = contrast_list(img_contrast, images)
267
+ elif jitter[order[idx]] == "saturation":
268
+ images = saturation_list(img_saturation, images)
269
+ return images
270
+
271
+
272
+ def lighting_list(imgs, alphastd, eigval, eigvec, alpha=None):
273
+ """
274
+ Perform AlexNet-style PCA jitter on the given list of images.
275
+ Args:
276
+ images (list): list of images to perform lighting jitter.
277
+ alphastd (float): jitter ratio for PCA jitter.
278
+ eigval (list): eigenvalues for PCA jitter.
279
+ eigvec (list[list]): eigenvectors for PCA jitter.
280
+ Returns:
281
+ out_images (list): the list of jittered images.
282
+ """
283
+ if alphastd == 0:
284
+ return imgs
285
+ # generate alpha1, alpha2, alpha3
286
+ alpha = np.random.normal(0, alphastd, size=(1, 3))
287
+ eig_vec = np.array(eigvec)
288
+ eig_val = np.reshape(eigval, (1, 3))
289
+ rgb = np.sum(
290
+ eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0),
291
+ axis=1,
292
+ )
293
+ out_images = []
294
+ for img in imgs:
295
+ for idx in range(img.shape[0]):
296
+ img[idx] = img[idx] + rgb[2 - idx]
297
+ out_images.append(img)
298
+ return out_images
299
+
300
+
301
+ def color_normalization(image, mean, stddev):
302
+ """
303
+ Perform color normalization on the image with the given mean and stddev.
304
+ Args:
305
+ image (array): image to perform color normalization.
306
+ mean (float): mean value to subtract.
307
+ stddev (float): stddev to divide by.
308
+ """
309
+ # Input image should be in format of CHW.
310
+ assert len(mean) == image.shape[0], "channel mean not computed properly"
311
+ assert len(stddev) == image.shape[0], "channel stddev not computed properly"
312
+ for idx in range(image.shape[0]):
313
+ image[idx] = image[idx] - mean[idx]
314
+ image[idx] = image[idx] / stddev[idx]
315
+ return image
316
+
317
+
318
+ def pad_image(image, pad_size, order="CHW"):
319
+ """
320
+ Pad the given image with the size of pad_size.
321
+ Args:
322
+ image (array): image to pad.
323
+ pad_size (int): size to pad.
324
+ order (str): order of the `height`, `channel` and `width`.
325
+ Returns:
326
+ img (array): padded image.
327
+ """
328
+ if order == "CHW":
329
+ img = np.pad(
330
+ image,
331
+ ((0, 0), (pad_size, pad_size), (pad_size, pad_size)),
332
+ mode=str("constant"),
333
+ )
334
+ elif order == "HWC":
335
+ img = np.pad(
336
+ image,
337
+ ((pad_size, pad_size), (pad_size, pad_size), (0, 0)),
338
+ mode=str("constant"),
339
+ )
340
+ return img
341
+
342
+
343
+ def horizontal_flip(prob, image, order="CHW"):
344
+ """
345
+ Horizontally flip the image.
346
+ Args:
347
+ prob (float): probability to flip.
348
+ image (array): image to pad.
349
+ order (str): order of the `height`, `channel` and `width`.
350
+ Returns:
351
+ img (array): flipped image.
352
+ """
353
+ assert order in ["CHW", "HWC"], "order {} is not supported".format(order)
354
+ if np.random.uniform() < prob:
355
+ if order == "CHW":
356
+ image = image[:, :, ::-1]
357
+ elif order == "HWC":
358
+ image = image[:, ::-1, :]
359
+ else:
360
+ raise NotImplementedError("Unknown order {}".format(order))
361
+ return image
362
+
363
+
364
+ def flip_boxes(boxes, im_width):
365
+ """
366
+ Horizontally flip the boxes.
367
+ Args:
368
+ boxes (array): box to flip.
369
+ im_width (int): width of the image.
370
+ Returns:
371
+ boxes_flipped (array): flipped box.
372
+ """
373
+
374
+ boxes_flipped = boxes.copy()
375
+ boxes_flipped[:, 0::4] = im_width - boxes[:, 2::4] - 1
376
+ boxes_flipped[:, 2::4] = im_width - boxes[:, 0::4] - 1
377
+ return boxes_flipped
378
+
379
+
380
+ def crop_boxes(boxes, x_offset, y_offset):
381
+ """
382
+ Crop the boxes given the offsets.
383
+ Args:
384
+ boxes (array): boxes to crop.
385
+ x_offset (int): offset on x.
386
+ y_offset (int): offset on y.
387
+ """
388
+ boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
389
+ boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
390
+ return boxes
391
+
392
+
393
+ def random_crop_list(images, size, pad_size=0, order="CHW", boxes=None):
394
+ """
395
+ Perform random crop on a list of images.
396
+ Args:
397
+ images (list): list of images to perform random crop.
398
+ size (int): size to crop.
399
+ pad_size (int): padding size.
400
+ order (str): order of the `height`, `channel` and `width`.
401
+ boxes (list): optional. Corresponding boxes to images.
402
+ Dimension is `num boxes` x 4.
403
+ Returns:
404
+ cropped (ndarray): the cropped list of images with dimension of
405
+ `height` x `width` x `channel`.
406
+ boxes (list): optional. Corresponding boxes to images. Dimension is
407
+ `num boxes` x 4.
408
+ """
409
+ # Explicitly handle each image order (CHW/HWC) to avoid transposing images.
410
+ if pad_size > 0:
411
+ images = [
412
+ pad_image(pad_size=pad_size, image=image, order=order)
413
+ for image in images
414
+ ]
415
+
416
+ # image format should be CHW.
417
+ if order == "CHW":
418
+ if images[0].shape[1] == size and images[0].shape[2] == size:
419
+ return images, boxes
420
+ height = images[0].shape[1]
421
+ width = images[0].shape[2]
422
+ y_offset = 0
423
+ if height > size:
424
+ y_offset = int(np.random.randint(0, height - size))
425
+ x_offset = 0
426
+ if width > size:
427
+ x_offset = int(np.random.randint(0, width - size))
428
+ cropped = [
429
+ image[:, y_offset : y_offset + size, x_offset : x_offset + size]
430
+ for image in images
431
+ ]
432
+ assert cropped[0].shape[1] == size, "Image not cropped properly"
433
+ assert cropped[0].shape[2] == size, "Image not cropped properly"
434
+ elif order == "HWC":
435
+ if images[0].shape[0] == size and images[0].shape[1] == size:
436
+ return images, boxes
437
+ height = images[0].shape[0]
438
+ width = images[0].shape[1]
439
+ y_offset = 0
440
+ if height > size:
441
+ y_offset = int(np.random.randint(0, height - size))
442
+ x_offset = 0
443
+ if width > size:
444
+ x_offset = int(np.random.randint(0, width - size))
445
+ cropped = [
446
+ image[y_offset : y_offset + size, x_offset : x_offset + size, :]
447
+ for image in images
448
+ ]
449
+ assert cropped[0].shape[0] == size, "Image not cropped properly"
450
+ assert cropped[0].shape[1] == size, "Image not cropped properly"
451
+
452
+ if boxes is not None:
453
+ boxes = [crop_boxes(proposal, x_offset, y_offset) for proposal in boxes]
454
+ return cropped, boxes
455
+
456
+
457
+ def center_crop(size, image):
458
+ """
459
+ Perform center crop on input images.
460
+ Args:
461
+ size (int): size of the cropped height and width.
462
+ image (array): the image to perform center crop.
463
+ """
464
+ height = image.shape[0]
465
+ width = image.shape[1]
466
+ y_offset = int(math.ceil((height - size) / 2))
467
+ x_offset = int(math.ceil((width - size) / 2))
468
+ cropped = image[y_offset : y_offset + size, x_offset : x_offset + size, :]
469
+ assert cropped.shape[0] == size, "Image height not cropped properly"
470
+ assert cropped.shape[1] == size, "Image width not cropped properly"
471
+ return cropped
472
+
473
+
474
+ # ResNet style scale jittering: randomly select the scale from
475
+ # [1/max_size, 1/min_size]
476
+ def random_scale_jitter(image, min_size, max_size):
477
+ """
478
+ Perform ResNet style random scale jittering: randomly select the scale from
479
+ [1/max_size, 1/min_size].
480
+ Args:
481
+ image (array): image to perform random scale.
482
+ min_size (int): min size to scale.
483
+ max_size (int) max size to scale.
484
+ Returns:
485
+ image (array): scaled image.
486
+ """
487
+ img_scale = int(
488
+ round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size))
489
+ )
490
+ image = scale(img_scale, image)
491
+ return image
492
+
493
+
494
+ def random_scale_jitter_list(images, min_size, max_size):
495
+ """
496
+ Perform ResNet style random scale jittering on a list of image: randomly
497
+ select the scale from [1/max_size, 1/min_size]. Note that all the images
498
+ will share the same scale.
499
+ Args:
500
+ images (list): list of images to perform random scale.
501
+ min_size (int): min size to scale.
502
+ max_size (int) max size to scale.
503
+ Returns:
504
+ images (list): list of scaled image.
505
+ """
506
+ img_scale = int(
507
+ round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size))
508
+ )
509
+ return [scale(img_scale, image) for image in images]
510
+
511
+
512
+ def random_sized_crop(image, size, area_frac=0.08):
513
+ """
514
+ Perform random sized cropping on the given image. Random crop with size
515
+ 8% - 100% image area and aspect ratio in [3/4, 4/3].
516
+ Args:
517
+ image (array): image to crop.
518
+ size (int): size to crop.
519
+ area_frac (float): minimum fraction of the image area to crop.
520
+ Returns:
521
+ (array): cropped image.
522
+ """
523
+ for _ in range(0, 10):
524
+ height = image.shape[0]
525
+ width = image.shape[1]
526
+ area = height * width
527
+ target_area = np.random.uniform(area_frac, 1.0) * area
528
+ aspect_ratio = np.random.uniform(3.0 / 4.0, 4.0 / 3.0)
529
+ w = int(round(math.sqrt(float(target_area) * aspect_ratio)))
530
+ h = int(round(math.sqrt(float(target_area) / aspect_ratio)))
531
+ if np.random.uniform() < 0.5:
532
+ w, h = h, w
533
+ if h <= height and w <= width:
534
+ if height == h:
535
+ y_offset = 0
536
+ else:
537
+ y_offset = np.random.randint(0, height - h)
538
+ if width == w:
539
+ x_offset = 0
540
+ else:
541
+ x_offset = np.random.randint(0, width - w)
542
+ y_offset = int(y_offset)
543
+ x_offset = int(x_offset)
544
+ cropped = image[y_offset : y_offset + h, x_offset : x_offset + w, :]
545
+ assert (
546
+ cropped.shape[0] == h and cropped.shape[1] == w
547
+ ), "Wrong crop size"
548
+ cropped = cv2.resize(
549
+ cropped, (size, size), interpolation=cv2.INTER_LINEAR
550
+ )
551
+ return cropped.astype(np.float32)
552
+ return center_crop(size, scale(size, image))
553
+
554
+
555
+ def lighting(img, alphastd, eigval, eigvec):
556
+ """
557
+ Perform AlexNet-style PCA jitter on the given image.
558
+ Args:
559
+ image (array): image to perform lighting jitter.
560
+ alphastd (float): jitter ratio for PCA jitter.
561
+ eigval (array): eigenvalues for PCA jitter.
562
+ eigvec (list): eigenvectors for PCA jitter.
563
+ Returns:
564
+ img (tensor): the jittered image.
565
+ """
566
+ if alphastd == 0:
567
+ return img
568
+ # generate alpha1, alpha2, alpha3.
569
+ alpha = np.random.normal(0, alphastd, size=(1, 3))
570
+ eig_vec = np.array(eigvec)
571
+ eig_val = np.reshape(eigval, (1, 3))
572
+ rgb = np.sum(
573
+ eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0),
574
+ axis=1,
575
+ )
576
+ for idx in range(img.shape[0]):
577
+ img[idx] = img[idx] + rgb[2 - idx]
578
+ return img
579
+
580
+
581
+ def random_sized_crop_list(images, size, crop_area_fraction=0.08):
582
+ """
583
+ Perform random sized cropping on the given list of images. Random crop with
584
+ size 8% - 100% image area and aspect ratio in [3/4, 4/3].
585
+ Args:
586
+ images (list): list of images to crop.
587
+ size (int): size to crop.
588
+ crop_area_fraction (float): minimum fraction of the image area to crop.
589
+ Returns:
590
+ (list): list of cropped image.
591
+ """
592
+ for _ in range(0, 10):
593
+ height = images[0].shape[0]
594
+ width = images[0].shape[1]
595
+ area = height * width
596
+ target_area = np.random.uniform(crop_area_fraction, 1.0) * area
597
+ aspect_ratio = np.random.uniform(3.0 / 4.0, 4.0 / 3.0)
598
+ w = int(round(math.sqrt(float(target_area) * aspect_ratio)))
599
+ h = int(round(math.sqrt(float(target_area) / aspect_ratio)))
600
+ if np.random.uniform() < 0.5:
601
+ w, h = h, w
602
+ if h <= height and w <= width:
603
+ if height == h:
604
+ y_offset = 0
605
+ else:
606
+ y_offset = np.random.randint(0, height - h)
607
+ if width == w:
608
+ x_offset = 0
609
+ else:
610
+ x_offset = np.random.randint(0, width - w)
611
+ y_offset = int(y_offset)
612
+ x_offset = int(x_offset)
613
+
614
+ cropped_images = []
615
+ for image in images:
616
+ cropped = image[
617
+ y_offset : y_offset + h, x_offset : x_offset + w, :
618
+ ]
619
+ assert (
620
+ cropped.shape[0] == h and cropped.shape[1] == w
621
+ ), "Wrong crop size"
622
+ cropped = cv2.resize(
623
+ cropped, (size, size), interpolation=cv2.INTER_LINEAR
624
+ )
625
+ cropped_images.append(cropped.astype(np.float32))
626
+ return cropped_images
627
+
628
+ return [center_crop(size, scale(size, image)) for image in images]
629
+
630
+
631
+ def blend(image1, image2, alpha):
632
+ return image1 * alpha + image2 * (1 - alpha)
633
+
634
+
635
+ def grayscale(image):
636
+ """
637
+ Convert the image to gray scale.
638
+ Args:
639
+ image (tensor): image to convert to gray scale. Dimension is
640
+ `channel` x `height` x `width`.
641
+ Returns:
642
+ img_gray (tensor): image in gray scale.
643
+ """
644
+ # R -> 0.299, G -> 0.587, B -> 0.114.
645
+ img_gray = np.copy(image)
646
+ gray_channel = 0.299 * image[2] + 0.587 * image[1] + 0.114 * image[0]
647
+ img_gray[0] = gray_channel
648
+ img_gray[1] = gray_channel
649
+ img_gray[2] = gray_channel
650
+ return img_gray
651
+
652
+
653
+ def saturation(var, image):
654
+ """
655
+ Perform color saturation on the given image.
656
+ Args:
657
+ var (float): variance.
658
+ image (array): image to perform color saturation.
659
+ Returns:
660
+ (array): image that performed color saturation.
661
+ """
662
+ img_gray = grayscale(image)
663
+ alpha = 1.0 + np.random.uniform(-var, var)
664
+ return blend(image, img_gray, alpha)
665
+
666
+
667
+ def brightness(var, image):
668
+ """
669
+ Perform color brightness on the given image.
670
+ Args:
671
+ var (float): variance.
672
+ image (array): image to perform color brightness.
673
+ Returns:
674
+ (array): image that performed color brightness.
675
+ """
676
+ img_bright = np.zeros(image.shape).astype(image.dtype)
677
+ alpha = 1.0 + np.random.uniform(-var, var)
678
+ return blend(image, img_bright, alpha)
679
+
680
+
681
+ def contrast(var, image):
682
+ """
683
+ Perform color contrast on the given image.
684
+ Args:
685
+ var (float): variance.
686
+ image (array): image to perform color contrast.
687
+ Returns:
688
+ (array): image that performed color contrast.
689
+ """
690
+ img_gray = grayscale(image)
691
+ img_gray.fill(np.mean(img_gray[0]))
692
+ alpha = 1.0 + np.random.uniform(-var, var)
693
+ return blend(image, img_gray, alpha)
694
+
695
+
696
+ def saturation_list(var, images):
697
+ """
698
+ Perform color saturation on the list of given images.
699
+ Args:
700
+ var (float): variance.
701
+ images (list): list of images to perform color saturation.
702
+ Returns:
703
+ (list): list of images that performed color saturation.
704
+ """
705
+ alpha = 1.0 + np.random.uniform(-var, var)
706
+
707
+ out_images = []
708
+ for image in images:
709
+ img_gray = grayscale(image)
710
+ out_images.append(blend(image, img_gray, alpha))
711
+ return out_images
712
+
713
+
714
+ def brightness_list(var, images):
715
+ """
716
+ Perform color brightness on the given list of images.
717
+ Args:
718
+ var (float): variance.
719
+ images (list): list of images to perform color brightness.
720
+ Returns:
721
+ (array): list of images that performed color brightness.
722
+ """
723
+ alpha = 1.0 + np.random.uniform(-var, var)
724
+
725
+ out_images = []
726
+ for image in images:
727
+ img_bright = np.zeros(image.shape).astype(image.dtype)
728
+ out_images.append(blend(image, img_bright, alpha))
729
+ return out_images
730
+
731
+
732
+ def contrast_list(var, images):
733
+ """
734
+ Perform color contrast on the given list of images.
735
+ Args:
736
+ var (float): variance.
737
+ images (list): list of images to perform color contrast.
738
+ Returns:
739
+ (array): image that performed color contrast.
740
+ """
741
+ alpha = 1.0 + np.random.uniform(-var, var)
742
+
743
+ out_images = []
744
+ for image in images:
745
+ img_gray = grayscale(image)
746
+ img_gray.fill(np.mean(img_gray[0]))
747
+ out_images.append(blend(image, img_gray, alpha))
748
+ return out_images
749
+
750
+
751
+ def color_jitter(image, img_brightness=0, img_contrast=0, img_saturation=0):
752
+ """
753
+ Perform color jitter on the given image.
754
+ Args:
755
+ image (array): image to perform color jitter.
756
+ img_brightness (float): jitter ratio for brightness.
757
+ img_contrast (float): jitter ratio for contrast.
758
+ img_saturation (float): jitter ratio for saturation.
759
+ Returns:
760
+ image (array): the jittered image.
761
+ """
762
+ jitter = []
763
+ if img_brightness != 0:
764
+ jitter.append("brightness")
765
+ if img_contrast != 0:
766
+ jitter.append("contrast")
767
+ if img_saturation != 0:
768
+ jitter.append("saturation")
769
+
770
+ if len(jitter) > 0:
771
+ order = np.random.permutation(np.arange(len(jitter)))
772
+ for idx in range(0, len(jitter)):
773
+ if jitter[order[idx]] == "brightness":
774
+ image = brightness(img_brightness, image)
775
+ elif jitter[order[idx]] == "contrast":
776
+ image = contrast(img_contrast, image)
777
+ elif jitter[order[idx]] == "saturation":
778
+ image = saturation(img_saturation, image)
779
+ return image
780
+
781
+
782
+ def revert_scaled_boxes(size, boxes, img_height, img_width):
783
+ """
784
+ Revert scaled input boxes to match the original image size.
785
+ Args:
786
+ size (int): size of the cropped image.
787
+ boxes (array): shape (num_boxes, 4).
788
+ img_height (int): height of original image.
789
+ img_width (int): width of original image.
790
+ Returns:
791
+ reverted_boxes (array): boxes scaled back to the original image size.
792
+ """
793
+ scaled_aspect = np.min([img_height, img_width])
794
+ scale_ratio = scaled_aspect / size
795
+ reverted_boxes = boxes * scale_ratio
796
+ return reverted_boxes
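A minimal usage sketch for the transforms above (not part of the commit). It assumes frames were read with OpenCV as HWC float32 arrays and that the module is importable as timesformer.datasets.cv2_transform; the normalization statistics below are placeholders.

import numpy as np
from timesformer.datasets import cv2_transform

# Eight dummy HWC float32 frames standing in for decoded video frames.
frames = [np.random.rand(240, 320, 3).astype(np.float32) for _ in range(8)]
# Scale the short side to 256, then take one shared random 224x224 crop (HWC order).
frames = [cv2_transform.scale(256, img) for img in frames]
frames, _ = cv2_transform.random_crop_list(frames, 224, order="HWC")
# Convert to CHW and normalize each channel (placeholder mean/std).
mean, std = [0.45, 0.45, 0.45], [0.225, 0.225, 0.225]
frames = [
    cv2_transform.color_normalization(cv2_transform.HWC2CHW(img), mean, std)
    for img in frames
]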
TimeSformer/timesformer/datasets/decoder.py ADDED
@@ -0,0 +1,392 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import math
4
+ import numpy as np
5
+ import random
6
+ import torch
7
+ import torchvision.io as io
8
+
9
+
10
+ def temporal_sampling(frames, start_idx, end_idx, num_samples):
11
+ """
12
+ Given the start and end frame index, sample num_samples frames between
13
+ the start and end with equal interval.
14
+ Args:
15
+ frames (tensor): a tensor of video frames, dimension is
16
+ `num video frames` x `channel` x `height` x `width`.
17
+ start_idx (int): the index of the start frame.
18
+ end_idx (int): the index of the end frame.
19
+ num_samples (int): number of frames to sample.
20
+ Returns:
21
+ frames (tensor): a tensor of temporally sampled video frames, dimension is
22
+ `num clip frames` x `channel` x `height` x `width`.
23
+ """
24
+ index = torch.linspace(start_idx, end_idx, num_samples)
25
+ index = torch.clamp(index, 0, frames.shape[0] - 1).long()
26
+ frames = torch.index_select(frames, 0, index)
27
+ return frames
28
+
29
+
30
+ def get_start_end_idx(video_size, clip_size, clip_idx, num_clips):
31
+ """
32
+ Sample a clip of size clip_size from a video of size video_size and
33
+ return the indices of the first and last frame of the clip. If clip_idx is
34
+ -1, the clip is randomly sampled, otherwise uniformly split the video to
35
+ num_clips clips, and select the start and end index of clip_idx-th video
36
+ clip.
37
+ Args:
38
+ video_size (int): number of overall frames.
39
+ clip_size (int): size of the clip to sample from the frames.
40
+ clip_idx (int): if clip_idx is -1, perform random jitter sampling. If
41
+ clip_idx is larger than -1, uniformly split the video to num_clips
42
+ clips, and select the start and end index of the clip_idx-th video
43
+ clip.
44
+ num_clips (int): overall number of clips to uniformly sample from the
45
+ given video for testing.
46
+ Returns:
47
+ start_idx (int): the start frame index.
48
+ end_idx (int): the end frame index.
49
+ """
50
+ delta = max(video_size - clip_size, 0)
51
+ if clip_idx == -1:
52
+ # Random temporal sampling.
53
+ start_idx = random.uniform(0, delta)
54
+ else:
55
+ # Uniformly sample the clip with the given index.
56
+ start_idx = delta * clip_idx / num_clips
57
+ end_idx = start_idx + clip_size - 1
58
+ return start_idx, end_idx
59
+
60
+
61
+ def pyav_decode_stream(
62
+ container, start_pts, end_pts, stream, stream_name, buffer_size=0
63
+ ):
64
+ """
65
+ Decode the video with PyAV decoder.
66
+ Args:
67
+ container (container): PyAV container.
68
+ start_pts (int): the starting Presentation TimeStamp to fetch the
69
+ video frames.
70
+ end_pts (int): the ending Presentation TimeStamp of the decoded frames.
71
+ stream (stream): PyAV stream.
72
+ stream_name (dict): a dictionary of streams. For example, {"video": 0}
73
+ means video stream at stream index 0.
74
+ buffer_size (int): number of additional frames to decode beyond end_pts.
75
+ Returns:
76
+ result (list): list of frames decoded.
77
+ max_pts (int): max Presentation TimeStamp of the video sequence.
78
+ """
79
+ # Seeking in the stream is imprecise. Thus, seek to an ealier PTS by a
80
+ # margin pts.
81
+ margin = 1024
82
+ seek_offset = max(start_pts - margin, 0)
83
+
84
+ container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
85
+ frames = {}
86
+ buffer_count = 0
87
+ max_pts = 0
88
+ for frame in container.decode(**stream_name):
89
+ max_pts = max(max_pts, frame.pts)
90
+ if frame.pts < start_pts:
91
+ continue
92
+ if frame.pts <= end_pts:
93
+ frames[frame.pts] = frame
94
+ else:
95
+ buffer_count += 1
96
+ frames[frame.pts] = frame
97
+ if buffer_count >= buffer_size:
98
+ break
99
+ result = [frames[pts] for pts in sorted(frames)]
100
+ return result, max_pts
101
+
102
+
103
+ def torchvision_decode(
104
+ video_handle,
105
+ sampling_rate,
106
+ num_frames,
107
+ clip_idx,
108
+ video_meta,
109
+ num_clips=10,
110
+ target_fps=30,
111
+ modalities=("visual",),
112
+ max_spatial_scale=0,
113
+ ):
114
+ """
115
+ If video_meta is not empty, perform temporal selective decoding to sample a
116
+ clip from the video with TorchVision decoder. If video_meta is empty, decode
117
+ the entire video and update the video_meta.
118
+ Args:
119
+ video_handle (bytes): raw bytes of the video file.
120
+ sampling_rate (int): frame sampling rate (interval between two sampled
121
+ frames).
122
+ num_frames (int): number of frames to sample.
123
+ clip_idx (int): if clip_idx is -1, perform random temporal
124
+ sampling. If clip_idx is larger than -1, uniformly split the
125
+ video to num_clips clips, and select the clip_idx-th video clip.
126
+ video_meta (dict): a dict contains VideoMetaData. Details can be found
127
+ at `pytorch/vision/torchvision/io/_video_opt.py`.
128
+ num_clips (int): overall number of clips to uniformly sample from the
129
+ given video.
130
+ target_fps (int): the input video may have a different fps; convert it to
131
+ the target video fps.
132
+ modalities (tuple): tuple of modalities to decode. Currently only
133
+ support `visual`, planning to support `acoustic` soon.
134
+ max_spatial_scale (int): the maximal resolution of the spatial shorter
135
+ edge size during decoding.
136
+ Returns:
137
+ frames (tensor): decoded frames from the video.
138
+ fps (float): the number of frames per second of the video.
139
+ decode_all_video (bool): if True, the entire video was decoded.
140
+ """
141
+ # Convert the bytes to a tensor.
142
+ video_tensor = torch.from_numpy(np.frombuffer(video_handle, dtype=np.uint8))
143
+
144
+ decode_all_video = True
145
+ video_start_pts, video_end_pts = 0, -1
146
+ # The video_meta is empty, fetch the meta data from the raw video.
147
+ if len(video_meta) == 0:
148
+ # Tracking the meta info for selective decoding in the future.
149
+ meta = io._probe_video_from_memory(video_tensor)
150
+ # Using the information from video_meta to perform selective decoding.
151
+ video_meta["video_timebase"] = meta.video_timebase
152
+ video_meta["video_numerator"] = meta.video_timebase.numerator
153
+ video_meta["video_denominator"] = meta.video_timebase.denominator
154
+ video_meta["has_video"] = meta.has_video
155
+ video_meta["video_duration"] = meta.video_duration
156
+ video_meta["video_fps"] = meta.video_fps
157
+ video_meta["audio_timebas"] = meta.audio_timebase
158
+ video_meta["audio_numerator"] = meta.audio_timebase.numerator
159
+ video_meta["audio_denominator"] = meta.audio_timebase.denominator
160
+ video_meta["has_audio"] = meta.has_audio
161
+ video_meta["audio_duration"] = meta.audio_duration
162
+ video_meta["audio_sample_rate"] = meta.audio_sample_rate
163
+
164
+ fps = video_meta["video_fps"]
165
+ if (
166
+ video_meta["has_video"]
167
+ and video_meta["video_denominator"] > 0
168
+ and video_meta["video_duration"] > 0
169
+ ):
170
+ # try selective decoding.
171
+ decode_all_video = False
172
+ clip_size = sampling_rate * num_frames / target_fps * fps
173
+ start_idx, end_idx = get_start_end_idx(
174
+ fps * video_meta["video_duration"], clip_size, clip_idx, num_clips
175
+ )
176
+ # Convert frame index to pts.
177
+ pts_per_frame = video_meta["video_denominator"] / fps
178
+ video_start_pts = int(start_idx * pts_per_frame)
179
+ video_end_pts = int(end_idx * pts_per_frame)
180
+
181
+ # Decode the raw video with the tv decoder.
182
+ v_frames, _ = io._read_video_from_memory(
183
+ video_tensor,
184
+ seek_frame_margin=1.0,
185
+ read_video_stream="visual" in modalities,
186
+ video_width=0,
187
+ video_height=0,
188
+ video_min_dimension=max_spatial_scale,
189
+ video_pts_range=(video_start_pts, video_end_pts),
190
+ video_timebase_numerator=video_meta["video_numerator"],
191
+ video_timebase_denominator=video_meta["video_denominator"],
192
+ )
193
+
194
+ if v_frames.shape == torch.Size([0]):
195
+ # failed selective decoding
196
+ decode_all_video = True
197
+ video_start_pts, video_end_pts = 0, -1
198
+ v_frames, _ = io._read_video_from_memory(
199
+ video_tensor,
200
+ seek_frame_margin=1.0,
201
+ read_video_stream="visual" in modalities,
202
+ video_width=0,
203
+ video_height=0,
204
+ video_min_dimension=max_spatial_scale,
205
+ video_pts_range=(video_start_pts, video_end_pts),
206
+ video_timebase_numerator=video_meta["video_numerator"],
207
+ video_timebase_denominator=video_meta["video_denominator"],
208
+ )
209
+
210
+ return v_frames, fps, decode_all_video
211
+
212
+
213
+ def pyav_decode(
214
+ container, sampling_rate, num_frames, clip_idx, num_clips=10, target_fps=30, start=None, end=None
215
+ , duration=None, frames_length=None):
216
+ """
217
+ Convert the video from its original fps to the target_fps. If the video
218
+ support selective decoding (contain decoding information in the video head),
219
+ the perform temporal selective decoding and sample a clip from the video
220
+ with the PyAV decoder. If the video does not support selective decoding,
221
+ decode the entire video.
222
+
223
+ Args:
224
+ container (container): pyav container.
225
+ sampling_rate (int): frame sampling rate (interval between two sampled
226
+ frames.
227
+ num_frames (int): number of frames to sample.
228
+ clip_idx (int): if clip_idx is -1, perform random temporal sampling. If
229
+ clip_idx is larger than -1, uniformly split the video to num_clips
230
+ clips, and select the clip_idx-th video clip.
231
+ num_clips (int): overall number of clips to uniformly sample from the
232
+ given video.
233
+ target_fps (int): the input video may have a different fps; convert it to
234
+ the target video fps before frame sampling.
235
+ Returns:
236
+ frames (tensor): decoded frames from the video. Return None if no
237
+ video stream was found.
238
+ fps (float): the number of frames per second of the video.
239
+ decode_all_video (bool): If True, the entire video was decoded.
240
+ """
241
+ # Try to fetch the decoding information from the video head. Some of the
242
+ # videos does not support fetching the decoding information, for that case
243
+ # it will get None duration.
244
+ fps = float(container.streams.video[0].average_rate)
245
+
246
+ orig_duration = duration
247
+ tb = float(container.streams.video[0].time_base)
248
+ frames_length = container.streams.video[0].frames
249
+ duration = container.streams.video[0].duration
250
+ if duration is None and orig_duration is not None:
251
+ duration = orig_duration / tb
252
+
253
+ if duration is None:
254
+ # If failed to fetch the decoding information, decode the entire video.
255
+ decode_all_video = True
256
+ video_start_pts, video_end_pts = 0, math.inf
257
+ else:
258
+ # Perform selective decoding.
259
+ decode_all_video = False
260
+ start_idx, end_idx = get_start_end_idx(
261
+ frames_length,
262
+ sampling_rate * num_frames / target_fps * fps,
263
+ clip_idx,
264
+ num_clips,
265
+ )
266
+ timebase = duration / frames_length
267
+ video_start_pts = int(start_idx * timebase)
268
+ video_end_pts = int(end_idx * timebase)
269
+
270
+ if start is not None and end is not None:
271
+ decode_all_video = False
272
+
273
+ frames = None
274
+ # If video stream was found, fetch video frames from the video.
275
+ if container.streams.video:
276
+ if start is None and end is None:
277
+ video_frames, max_pts = pyav_decode_stream(
278
+ container,
279
+ video_start_pts,
280
+ video_end_pts,
281
+ container.streams.video[0],
282
+ {"video": 0},
283
+ )
284
+ else:
285
+ timebase = duration / frames_length
286
+ start_i = start
287
+ end_i = end
288
+ video_frames, max_pts = pyav_decode_stream(
289
+ container,
290
+ start_i,
291
+ end_i,
292
+ container.streams.video[0],
293
+ {"video": 0},
294
+ )
295
+ container.close()
296
+
297
+ frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
298
+ frames = torch.as_tensor(np.stack(frames))
299
+
300
+ return frames, fps, decode_all_video
301
+
302
+
303
+ def decode(
304
+ container,
305
+ sampling_rate,
306
+ num_frames,
307
+ clip_idx=-1,
308
+ num_clips=10,
309
+ video_meta=None,
310
+ target_fps=30,
311
+ backend="pyav",
312
+ max_spatial_scale=0,
313
+ start=None,
314
+ end=None,
315
+ duration=None,
316
+ frames_length=None,
317
+ ):
318
+ """
319
+ Decode the video and perform temporal sampling.
320
+ Args:
321
+ container (container): pyav container.
322
+ sampling_rate (int): frame sampling rate (interval between two sampled
323
+ frames).
324
+ num_frames (int): number of frames to sample.
325
+ clip_idx (int): if clip_idx is -1, perform random temporal
326
+ sampling. If clip_idx is larger than -1, uniformly split the
327
+ video to num_clips clips, and select the
328
+ clip_idx-th video clip.
329
+ num_clips (int): overall number of clips to uniformly
330
+ sample from the given video.
331
+ video_meta (dict): a dict contains VideoMetaData. Details can be find
332
+ at `pytorch/vision/torchvision/io/_video_opt.py`.
333
+ target_fps (int): the input video may have different fps, convert it to
334
+ the target video fps before frame sampling.
335
+ backend (str): decoding backend includes `pyav` and `torchvision`. The
336
+ default one is `pyav`.
337
+ max_spatial_scale (int): keep the aspect ratio and resize the frame so
338
+ that shorter edge size is max_spatial_scale. Only used in
339
+ `torchvision` backend.
340
+ Returns:
341
+ frames (tensor): decoded frames from the video.
342
+ """
343
+ # Currently support two decoders: 1) PyAV, and 2) TorchVision.
344
+ assert clip_idx >= -1, "Not valied clip_idx {}".format(clip_idx)
345
+ try:
346
+ if backend == "pyav":
347
+ frames, fps, decode_all_video = pyav_decode(
348
+ container,
349
+ sampling_rate,
350
+ num_frames,
351
+ clip_idx,
352
+ num_clips,
353
+ target_fps,
354
+ start,
355
+ end,
356
+ duration,
357
+ frames_length,
358
+ )
359
+ elif backend == "torchvision":
360
+ frames, fps, decode_all_video = torchvision_decode(
361
+ container,
362
+ sampling_rate,
363
+ num_frames,
364
+ clip_idx,
365
+ video_meta,
366
+ num_clips,
367
+ target_fps,
368
+ ("visual",),
369
+ max_spatial_scale,
370
+ )
371
+ else:
372
+ raise NotImplementedError(
373
+ "Unknown decoding backend {}".format(backend)
374
+ )
375
+ except Exception as e:
376
+ print("Failed to decode by {} with exception: {}".format(backend, e))
377
+ return None
378
+
379
+ # Return None if the frames were not decoded successfully.
380
+ if frames is None or frames.size(0) == 0:
381
+ return None
382
+
383
+ clip_sz = sampling_rate * num_frames / target_fps * fps
384
+ start_idx, end_idx = get_start_end_idx(
385
+ frames.shape[0],
386
+ clip_sz,
387
+ clip_idx if decode_all_video else 0,
388
+ num_clips if decode_all_video else 1,
389
+ )
390
+ # Perform temporal sampling from the decoded video.
391
+ frames = temporal_sampling(frames, start_idx, end_idx, num_frames)
392
+ return frames
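A minimal usage sketch for decode (not part of the commit). It assumes PyAV is installed and uses a placeholder video path; with clip_idx=-1 the clip is sampled at a random temporal location.

import av
from timesformer.datasets import decoder

container = av.open("video.mp4")  # placeholder path
frames = decoder.decode(
    container,
    sampling_rate=8,   # interval between two sampled frames
    num_frames=8,      # frames per clip
    clip_idx=-1,       # -1 -> random temporal sampling
    num_clips=1,
    video_meta={},
    target_fps=30,
    backend="pyav",
)
# frames is an (8, H, W, 3) uint8 tensor, or None if decoding failed.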
TimeSformer/timesformer/datasets/kinetics.py ADDED
@@ -0,0 +1,294 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import os
4
+ import random
5
+ import torch
6
+ import torch.utils.data
7
+ from fvcore.common.file_io import PathManager
8
+
9
+ import timesformer.utils.logging as logging
10
+
11
+ from . import decoder as decoder
12
+ from . import utils as utils
13
+ from . import video_container as container
14
+ from .build import DATASET_REGISTRY
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ @DATASET_REGISTRY.register()
19
+ class Kinetics(torch.utils.data.Dataset):
20
+ """
21
+ Kinetics video loader. Construct the Kinetics video loader, then sample
22
+ clips from the videos. For training and validation, a single clip is
23
+ randomly sampled from every video with random cropping, scaling, and
24
+ flipping. For testing, multiple clips are uniformaly sampled from every
25
+ video with uniform cropping. For uniform cropping, we take the left, center,
26
+ and right crop if the width is larger than height, or take top, center, and
27
+ bottom crop if the height is larger than the width.
28
+ """
29
+
30
+ def __init__(self, cfg, mode, num_retries=10):
31
+ """
32
+ Construct the Kinetics video loader with a given csv file. The format of
33
+ the csv file is:
34
+ ```
35
+ path_to_video_1 label_1
36
+ path_to_video_2 label_2
37
+ ...
38
+ path_to_video_N label_N
39
+ ```
40
+ Args:
41
+ cfg (CfgNode): configs.
42
+ mode (string): Options includes `train`, `val`, or `test` mode.
43
+ For the train and val mode, the data loader will take data
44
+ from the train or val set, and sample one clip per video.
45
+ For the test mode, the data loader will take data from test set,
46
+ and sample multiple clips per video.
47
+ num_retries (int): number of retries.
48
+ """
49
+ # Only support train, val, and test mode.
50
+ assert mode in [
51
+ "train",
52
+ "val",
53
+ "test",
54
+ ], "Split '{}' not supported for Kinetics".format(mode)
55
+ self.mode = mode
56
+ self.cfg = cfg
57
+
58
+ self._video_meta = {}
59
+ self._num_retries = num_retries
60
+ # For training or validation mode, one single clip is sampled from every
61
+ # video. For testing, NUM_ENSEMBLE_VIEWS clips are sampled from every
62
+ # video. For every clip, NUM_SPATIAL_CROPS is cropped spatially from
63
+ # the frames.
64
+ if self.mode in ["train", "val"]:
65
+ self._num_clips = 1
66
+ elif self.mode in ["test"]:
67
+ self._num_clips = (
68
+ cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
69
+ )
70
+
71
+ logger.info("Constructing Kinetics {}...".format(mode))
72
+ self._construct_loader()
73
+
74
+ def _construct_loader(self):
75
+ """
76
+ Construct the video loader.
77
+ """
78
+ path_to_file = os.path.join(
79
+ self.cfg.DATA.PATH_TO_DATA_DIR, "{}.csv".format(self.mode)
80
+ )
81
+ assert PathManager.exists(path_to_file), "{} dir not found".format(
82
+ path_to_file
83
+ )
84
+
85
+ self._path_to_videos = []
86
+ self._labels = []
87
+ self._spatial_temporal_idx = []
88
+ with PathManager.open(path_to_file, "r") as f:
89
+ for clip_idx, path_label in enumerate(f.read().splitlines()):
90
+ assert (
91
+ len(path_label.split(self.cfg.DATA.PATH_LABEL_SEPARATOR))
92
+ == 2
93
+ )
94
+ path, label = path_label.split(
95
+ self.cfg.DATA.PATH_LABEL_SEPARATOR
96
+ )
97
+ for idx in range(self._num_clips):
98
+ self._path_to_videos.append(
99
+ os.path.join(self.cfg.DATA.PATH_PREFIX, path)
100
+ )
101
+ self._labels.append(int(label))
102
+ self._spatial_temporal_idx.append(idx)
103
+ self._video_meta[clip_idx * self._num_clips + idx] = {}
104
+ assert (
105
+ len(self._path_to_videos) > 0
106
+ ), "Failed to load Kinetics split {} from {}".format(
107
+ self._split_idx, path_to_file
108
+ )
109
+ logger.info(
110
+ "Constructing kinetics dataloader (size: {}) from {}".format(
111
+ len(self._path_to_videos), path_to_file
112
+ )
113
+ )
114
+
115
+ def __getitem__(self, index):
116
+ """
117
+ Given the video index, return the list of frames, label, and video
118
+ index if the video can be fetched and decoded successfully, otherwise
119
+ repeatedly find a random video that can be decoded as a replacement.
120
+ Args:
121
+ index (int): the video index provided by the pytorch sampler.
122
+ Returns:
123
+ frames (tensor): the frames sampled from the video. The dimension
124
+ is `channel` x `num frames` x `height` x `width`.
125
+ label (int): the label of the current video.
126
+ index (int): if the video provided by pytorch sampler can be
127
+ decoded, then return the index of the video. If not, return the
128
+ index of the video replacement that can be decoded.
129
+ """
130
+ short_cycle_idx = None
131
+ # When short cycle is used, input index is a tuple.
132
+ if isinstance(index, tuple):
133
+ index, short_cycle_idx = index
134
+
135
+ if self.mode in ["train", "val"]:
136
+ # -1 indicates random sampling.
137
+ temporal_sample_index = -1
138
+ spatial_sample_index = -1
139
+ min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
140
+ max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
141
+ crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
142
+ if short_cycle_idx in [0, 1]:
143
+ crop_size = int(
144
+ round(
145
+ self.cfg.MULTIGRID.SHORT_CYCLE_FACTORS[short_cycle_idx]
146
+ * self.cfg.MULTIGRID.DEFAULT_S
147
+ )
148
+ )
149
+ if self.cfg.MULTIGRID.DEFAULT_S > 0:
150
+ # Decreasing the scale is equivalent to using a larger "span"
151
+ # in a sampling grid.
152
+ min_scale = int(
153
+ round(
154
+ float(min_scale)
155
+ * crop_size
156
+ / self.cfg.MULTIGRID.DEFAULT_S
157
+ )
158
+ )
159
+ elif self.mode in ["test"]:
160
+ temporal_sample_index = (
161
+ self._spatial_temporal_idx[index]
162
+ // self.cfg.TEST.NUM_SPATIAL_CROPS
163
+ )
164
+ # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
165
+ # center, or right if width is larger than height, and top, middle,
166
+ # or bottom if height is larger than width.
167
+ spatial_sample_index = (
168
+ (
169
+ self._spatial_temporal_idx[index]
170
+ % self.cfg.TEST.NUM_SPATIAL_CROPS
171
+ )
172
+ if self.cfg.TEST.NUM_SPATIAL_CROPS > 1
173
+ else 1
174
+ )
175
+ min_scale, max_scale, crop_size = (
176
+ [self.cfg.DATA.TEST_CROP_SIZE] * 3
177
+ if self.cfg.TEST.NUM_SPATIAL_CROPS > 1
178
+ else [self.cfg.DATA.TRAIN_JITTER_SCALES[0]] * 2
179
+ + [self.cfg.DATA.TEST_CROP_SIZE]
180
+ )
181
+ # The testing is deterministic and no jitter should be performed.
182
+ # min_scale, max_scale, and crop_size are expected to be the same.
183
+ assert len({min_scale, max_scale}) == 1
184
+ else:
185
+ raise NotImplementedError(
186
+ "Does not support {} mode".format(self.mode)
187
+ )
188
+ sampling_rate = utils.get_random_sampling_rate(
189
+ self.cfg.MULTIGRID.LONG_CYCLE_SAMPLING_RATE,
190
+ self.cfg.DATA.SAMPLING_RATE,
191
+ )
192
+ # Try to decode and sample a clip from a video. If the video can not be
193
+ # decoded, repeatedly find a random video replacement that can be decoded.
194
+ for i_try in range(self._num_retries):
195
+ video_container = None
196
+ try:
197
+ video_container = container.get_video_container(
198
+ self._path_to_videos[index],
199
+ self.cfg.DATA_LOADER.ENABLE_MULTI_THREAD_DECODE,
200
+ self.cfg.DATA.DECODING_BACKEND,
201
+ )
202
+ except Exception as e:
203
+ logger.info(
204
+ "Failed to load video from {} with error {}".format(
205
+ self._path_to_videos[index], e
206
+ )
207
+ )
208
+ # Select a random video if the current video could not be accessed.
209
+ if video_container is None:
210
+ logger.warning(
211
+ "Failed to meta load video idx {} from {}; trial {}".format(
212
+ index, self._path_to_videos[index], i_try
213
+ )
214
+ )
215
+ if self.mode not in ["test"] and i_try > self._num_retries // 2:
216
+ # let's try another one
217
+ index = random.randint(0, len(self._path_to_videos) - 1)
218
+ continue
219
+
220
+ # Decode video. Meta info is used to perform selective decoding.
221
+ frames = decoder.decode(
222
+ video_container,
223
+ sampling_rate,
224
+ self.cfg.DATA.NUM_FRAMES,
225
+ temporal_sample_index,
226
+ self.cfg.TEST.NUM_ENSEMBLE_VIEWS,
227
+ video_meta=self._video_meta[index],
228
+ target_fps=self.cfg.DATA.TARGET_FPS,
229
+ backend=self.cfg.DATA.DECODING_BACKEND,
230
+ max_spatial_scale=min_scale,
231
+ )
232
+
233
+ # If decoding failed (wrong format, video is too short, etc.),
234
+ # select another video.
235
+ if frames is None:
236
+ logger.warning(
237
+ "Failed to decode video idx {} from {}; trial {}".format(
238
+ index, self._path_to_videos[index], i_try
239
+ )
240
+ )
241
+ if self.mode not in ["test"] and i_try > self._num_retries // 2:
242
+ # let's try another one
243
+ index = random.randint(0, len(self._path_to_videos) - 1)
244
+ continue
245
+
246
+
247
+ label = self._labels[index]
248
+
249
+ # Perform color normalization.
250
+ frames = utils.tensor_normalize(
251
+ frames, self.cfg.DATA.MEAN, self.cfg.DATA.STD
252
+ )
253
+
254
+ # T H W C -> C T H W.
255
+ frames = frames.permute(3, 0, 1, 2)
256
+ # Perform data augmentation.
257
+ frames = utils.spatial_sampling(
258
+ frames,
259
+ spatial_idx=spatial_sample_index,
260
+ min_scale=min_scale,
261
+ max_scale=max_scale,
262
+ crop_size=crop_size,
263
+ random_horizontal_flip=self.cfg.DATA.RANDOM_FLIP,
264
+ inverse_uniform_sampling=self.cfg.DATA.INV_UNIFORM_SAMPLE,
265
+ )
266
+
267
+
268
+ if self.cfg.MODEL.ARCH not in ["vit"]:
269
+ frames = utils.pack_pathway_output(self.cfg, frames)
270
+ else:
271
+ # Perform temporal sampling from the fast pathway.
272
+ frames = torch.index_select(
273
+ frames,
274
+ 1,
275
+ torch.linspace(
276
+ 0, frames.shape[1] - 1, self.cfg.DATA.NUM_FRAMES
277
+
278
+ ).long(),
279
+ )
280
+
281
+ return frames, label, index, {}
282
+ else:
283
+ raise RuntimeError(
284
+ "Failed to fetch video after {} retries.".format(
285
+ self._num_retries
286
+ )
287
+ )
288
+
289
+ def __len__(self):
290
+ """
291
+ Returns:
292
+ (int): the number of videos in the dataset.
293
+ """
294
+ return len(self._path_to_videos)
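A minimal usage sketch for the Kinetics dataset (not part of the commit). It assumes a SlowFast-style get_cfg() helper in timesformer/config/defaults.py, one of the provided Kinetics YAML configs, and train/val/test csv files under cfg.DATA.PATH_TO_DATA_DIR.

from timesformer.config.defaults import get_cfg  # assumed SlowFast-style helper
from timesformer.datasets.kinetics import Kinetics

cfg = get_cfg()
cfg.merge_from_file("TimeSformer/configs/Kinetics/TimeSformer_divST_8x32_224.yaml")
dataset = Kinetics(cfg, mode="train")
frames, label, index, meta = dataset[0]  # frames: C x T x H x W float tensor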
TimeSformer/timesformer/datasets/loader.py ADDED
@@ -0,0 +1,134 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """Data loader."""
4
+
5
+ import itertools
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data._utils.collate import default_collate
9
+ from torch.utils.data.distributed import DistributedSampler
10
+ from torch.utils.data.sampler import RandomSampler
11
+
12
+ from timesformer.datasets.multigrid_helper import ShortCycleBatchSampler
13
+
14
+ from . import utils as utils
15
+ from .build import build_dataset
16
+
17
+
18
+ def detection_collate(batch):
19
+ """
20
+ Collate function for detection task. Concatanate bboxes, labels and
21
+ metadata from different samples in the first dimension instead of
22
+ stacking them to have a batch-size dimension.
23
+ Args:
24
+ batch (tuple or list): data batch to collate.
25
+ Returns:
26
+ (tuple): collated detection data batch.
27
+ """
28
+ inputs, labels, video_idx, extra_data = zip(*batch)
29
+ inputs, video_idx = default_collate(inputs), default_collate(video_idx)
30
+ labels = torch.tensor(np.concatenate(labels, axis=0)).float()
31
+
32
+ collated_extra_data = {}
33
+ for key in extra_data[0].keys():
34
+ data = [d[key] for d in extra_data]
35
+ if key == "boxes" or key == "ori_boxes":
36
+ # Append idx info to the bboxes before concatenating them.
37
+ bboxes = [
38
+ np.concatenate(
39
+ [np.full((data[i].shape[0], 1), float(i)), data[i]], axis=1
40
+ )
41
+ for i in range(len(data))
42
+ ]
43
+ bboxes = np.concatenate(bboxes, axis=0)
44
+ collated_extra_data[key] = torch.tensor(bboxes).float()
45
+ elif key == "metadata":
46
+ collated_extra_data[key] = torch.tensor(
47
+ list(itertools.chain(*data))
48
+ ).view(-1, 2)
49
+ else:
50
+ collated_extra_data[key] = default_collate(data)
51
+
52
+ return inputs, labels, video_idx, collated_extra_data
53
+
54
+
55
+ def construct_loader(cfg, split, is_precise_bn=False):
56
+ """
57
+ Constructs the data loader for the given dataset.
58
+ Args:
59
+ cfg (CfgNode): configs. Details can be found in
60
+ slowfast/config/defaults.py
61
+ split (str): the split of the data loader. Options include `train`,
62
+ `val`, and `test`.
63
+ """
64
+ assert split in ["train", "val", "test"]
65
+ if split in ["train"]:
66
+ dataset_name = cfg.TRAIN.DATASET
67
+ batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
68
+ shuffle = True
69
+ drop_last = True
70
+ elif split in ["val"]:
71
+ dataset_name = cfg.TRAIN.DATASET
72
+ batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
73
+ shuffle = False
74
+ drop_last = False
75
+ elif split in ["test"]:
76
+ dataset_name = cfg.TEST.DATASET
77
+ batch_size = int(cfg.TEST.BATCH_SIZE / max(1, cfg.NUM_GPUS))
78
+ shuffle = False
79
+ drop_last = False
80
+
81
+ # Construct the dataset
82
+ dataset = build_dataset(dataset_name, cfg, split)
83
+
84
+ if cfg.MULTIGRID.SHORT_CYCLE and split in ["train"] and not is_precise_bn:
85
+ # Create a sampler for multi-process training
86
+ sampler = utils.create_sampler(dataset, shuffle, cfg)
87
+ batch_sampler = ShortCycleBatchSampler(
88
+ sampler, batch_size=batch_size, drop_last=drop_last, cfg=cfg
89
+ )
90
+ # Create a loader
91
+ loader = torch.utils.data.DataLoader(
92
+ dataset,
93
+ batch_sampler=batch_sampler,
94
+ num_workers=cfg.DATA_LOADER.NUM_WORKERS,
95
+ pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
96
+ worker_init_fn=utils.loader_worker_init_fn(dataset),
97
+ )
98
+ else:
99
+ # Create a sampler for multi-process training
100
+ sampler = utils.create_sampler(dataset, shuffle, cfg)
101
+ # Create a loader
102
+ loader = torch.utils.data.DataLoader(
103
+ dataset,
104
+ batch_size=batch_size,
105
+ shuffle=(False if sampler else shuffle),
106
+ sampler=sampler,
107
+ num_workers=cfg.DATA_LOADER.NUM_WORKERS,
108
+ pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
109
+ drop_last=drop_last,
110
+ collate_fn=detection_collate if cfg.DETECTION.ENABLE else None,
111
+ worker_init_fn=utils.loader_worker_init_fn(dataset),
112
+ )
113
+ return loader
114
+
115
+
116
+ def shuffle_dataset(loader, cur_epoch):
117
+ """
118
+ Shuffles the data.
119
+ Args:
120
+ loader (loader): data loader to perform shuffle.
121
+ cur_epoch (int): number of the current epoch.
122
+ """
123
+ sampler = (
124
+ loader.batch_sampler.sampler
125
+ if isinstance(loader.batch_sampler, ShortCycleBatchSampler)
126
+ else loader.sampler
127
+ )
128
+ assert isinstance(
129
+ sampler, (RandomSampler, DistributedSampler)
130
+ ), "Sampler type '{}' not supported".format(type(sampler))
131
+ # RandomSampler handles shuffling automatically
132
+ if isinstance(sampler, DistributedSampler):
133
+ # DistributedSampler shuffles data based on epoch
134
+ sampler.set_epoch(cur_epoch)
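A minimal training-loop sketch (not part of the commit). cfg is assumed to be an already-loaded config node with the usual SOLVER.MAX_EPOCH field; shuffle_dataset is called once per epoch so a DistributedSampler reshuffles its indices.

from timesformer.datasets.loader import construct_loader, shuffle_dataset

train_loader = construct_loader(cfg, "train")
for cur_epoch in range(cfg.SOLVER.MAX_EPOCH):
    shuffle_dataset(train_loader, cur_epoch)
    for inputs, labels, video_idx, meta in train_loader:
        pass  # forward/backward pass goes here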
TimeSformer/timesformer/datasets/multigrid_helper.py ADDED
@@ -0,0 +1,78 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """Helper functions for multigrid training."""
4
+
5
+ import numpy as np
6
+ from torch._six import int_classes as _int_classes
7
+ from torch.utils.data.sampler import Sampler
8
+
9
+
10
+ class ShortCycleBatchSampler(Sampler):
11
+ """
12
+ Extend Sampler to support "short cycle" sampling.
13
+ See paper "A Multigrid Method for Efficiently Training Video Models",
14
+ Wu et al., 2019 (https://arxiv.org/abs/1912.00998) for details.
15
+ """
16
+
17
+ def __init__(self, sampler, batch_size, drop_last, cfg):
18
+ if not isinstance(sampler, Sampler):
19
+ raise ValueError(
20
+ "sampler should be an instance of "
21
+ "torch.utils.data.Sampler, but got sampler={}".format(sampler)
22
+ )
23
+ if (
24
+ not isinstance(batch_size, _int_classes)
25
+ or isinstance(batch_size, bool)
26
+ or batch_size <= 0
27
+ ):
28
+ raise ValueError(
29
+ "batch_size should be a positive integer value, "
30
+ "but got batch_size={}".format(batch_size)
31
+ )
32
+ if not isinstance(drop_last, bool):
33
+ raise ValueError(
34
+ "drop_last should be a boolean value, but got "
35
+ "drop_last={}".format(drop_last)
36
+ )
37
+ self.sampler = sampler
38
+ self.drop_last = drop_last
39
+
40
+ bs_factor = [
41
+ int(
42
+ round(
43
+ (
44
+ float(cfg.DATA.TRAIN_CROP_SIZE)
45
+ / (s * cfg.MULTIGRID.DEFAULT_S)
46
+ )
47
+ ** 2
48
+ )
49
+ )
50
+ for s in cfg.MULTIGRID.SHORT_CYCLE_FACTORS
51
+ ]
52
+
53
+ self.batch_sizes = [
54
+ batch_size * bs_factor[0],
55
+ batch_size * bs_factor[1],
56
+ batch_size,
57
+ ]
58
+
59
+ def __iter__(self):
60
+ counter = 0
61
+ batch_size = self.batch_sizes[0]
62
+ batch = []
63
+ for idx in self.sampler:
64
+ batch.append((idx, counter % 3))
65
+ if len(batch) == batch_size:
66
+ yield batch
67
+ counter += 1
68
+ batch_size = self.batch_sizes[counter % 3]
69
+ batch = []
70
+ if len(batch) > 0 and not self.drop_last:
71
+ yield batch
72
+
73
+ def __len__(self):
74
+ avg_batch_size = sum(self.batch_sizes) / 3.0
75
+ if self.drop_last:
76
+ return int(np.floor(len(self.sampler) / avg_batch_size))
77
+ else:
78
+ return int(np.ceil(len(self.sampler) / avg_batch_size))
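A minimal usage sketch for ShortCycleBatchSampler (not part of the commit). dataset and cfg are assumed to exist; construct_loader above builds this sampler automatically when cfg.MULTIGRID.SHORT_CYCLE is enabled.

import torch
from torch.utils.data.sampler import RandomSampler
from timesformer.datasets.multigrid_helper import ShortCycleBatchSampler

sampler = RandomSampler(dataset)  # `dataset` built elsewhere
batch_sampler = ShortCycleBatchSampler(sampler, batch_size=8, drop_last=True, cfg=cfg)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_sampler=batch_sampler,
    num_workers=cfg.DATA_LOADER.NUM_WORKERS,
)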
TimeSformer/timesformer/datasets/ssv2.py ADDED
@@ -0,0 +1,278 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import json
4
+ import numpy as np
5
+ import os
6
+ import random
7
+ from itertools import chain as chain
8
+ import torch
9
+ import torch.utils.data
10
+ from fvcore.common.file_io import PathManager
11
+
12
+ import timesformer.utils.logging as logging
13
+
14
+ from . import utils as utils
15
+ from .build import DATASET_REGISTRY
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
+ @DATASET_REGISTRY.register()
21
+ class Ssv2(torch.utils.data.Dataset):
22
+ """
23
+ Something-Something v2 (SSV2) video loader. Construct the SSV2 video loader,
24
+ then sample clips from the videos. For training and validation, a single
25
+ clip is randomly sampled from every video with random cropping, scaling, and
26
+ flipping. For testing, multiple clips are uniformaly sampled from every
27
+ video with uniform cropping. For uniform cropping, we take the left, center,
28
+ and right crop if the width is larger than height, or take top, center, and
29
+ bottom crop if the height is larger than the width.
30
+ """
31
+
32
+ def __init__(self, cfg, mode, num_retries=10):
33
+ """
34
+ Load Something-Something V2 data (frame paths, labels, etc. ) to a given
35
+ Dataset object. The dataset could be downloaded from Something-Something
36
+ official website (https://20bn.com/datasets/something-something).
37
+ Please see datasets/DATASET.md for more information about the data format.
38
+ Args:
39
+ cfg (CfgNode): configs.
40
+ mode (string): Options includes `train`, `val`, or `test` mode.
41
+ For the train and val mode, the data loader will take data
42
+ from the train or val set, and sample one clip per video.
43
+ For the test mode, the data loader will take data from test set,
44
+ and sample multiple clips per video.
45
+ num_retries (int): number of retries for reading frames from disk.
46
+ """
47
+ # Only support train, val, and test mode.
48
+ assert mode in [
49
+ "train",
50
+ "val",
51
+ "test",
52
+ ], "Split '{}' not supported for Something-Something V2".format(mode)
53
+ self.mode = mode
54
+ self.cfg = cfg
55
+
56
+ self._video_meta = {}
57
+ self._num_retries = num_retries
58
+ # For training or validation mode, one single clip is sampled from every
59
+ # video. For testing, NUM_ENSEMBLE_VIEWS clips are sampled from every
60
+ # video. For every clip, NUM_SPATIAL_CROPS is cropped spatially from
61
+ # the frames.
62
+ if self.mode in ["train", "val"]:
63
+ self._num_clips = 1
64
+ elif self.mode in ["test"]:
65
+ self._num_clips = (
66
+ cfg.TEST.NUM_ENSEMBLE_VIEWS * cfg.TEST.NUM_SPATIAL_CROPS
67
+ )
68
+
69
+ logger.info("Constructing Something-Something V2 {}...".format(mode))
70
+ self._construct_loader()
71
+
72
+ def _construct_loader(self):
73
+ """
74
+ Construct the video loader.
75
+ """
76
+ # Loading label names.
77
+ with PathManager.open(
78
+ os.path.join(
79
+ self.cfg.DATA.PATH_TO_DATA_DIR,
80
+ "something-something-v2-labels.json",
81
+ ),
82
+ "r",
83
+ ) as f:
84
+ label_dict = json.load(f)
85
+
86
+ # Loading labels.
87
+ label_file = os.path.join(
88
+ self.cfg.DATA.PATH_TO_DATA_DIR,
89
+ "something-something-v2-{}.json".format(
90
+ "train" if self.mode == "train" else "validation"
91
+ ),
92
+ )
93
+ with PathManager.open(label_file, "r") as f:
94
+ label_json = json.load(f)
95
+
96
+ self._video_names = []
97
+ self._labels = []
98
+ for video in label_json:
99
+ video_name = video["id"]
100
+ template = video["template"]
101
+ template = template.replace("[", "")
102
+ template = template.replace("]", "")
103
+ label = int(label_dict[template])
104
+ self._video_names.append(video_name)
105
+ self._labels.append(label)
106
+
107
+ path_to_file = os.path.join(
108
+ self.cfg.DATA.PATH_TO_DATA_DIR,
109
+ "{}.csv".format("train" if self.mode == "train" else "val"),
110
+ )
111
+ assert PathManager.exists(path_to_file), "{} dir not found".format(
112
+ path_to_file
113
+ )
114
+
115
+ self._path_to_videos, _ = utils.load_image_lists(
116
+ path_to_file, self.cfg.DATA.PATH_PREFIX
117
+ )
118
+
119
+ assert len(self._path_to_videos) == len(self._video_names), (
120
+ len(self._path_to_videos),
121
+ len(self._video_names),
122
+ )
123
+
124
+
125
+ # From dict to list.
126
+ new_paths, new_labels = [], []
127
+ for index in range(len(self._video_names)):
128
+ if self._video_names[index] in self._path_to_videos:
129
+ new_paths.append(self._path_to_videos[self._video_names[index]])
130
+ new_labels.append(self._labels[index])
131
+
132
+ self._labels = new_labels
133
+ self._path_to_videos = new_paths
134
+
135
+ # Extend self when self._num_clips > 1 (during testing).
136
+ self._path_to_videos = list(
137
+ chain.from_iterable(
138
+ [[x] * self._num_clips for x in self._path_to_videos]
139
+ )
140
+ )
141
+ self._labels = list(
142
+ chain.from_iterable([[x] * self._num_clips for x in self._labels])
143
+ )
144
+ self._spatial_temporal_idx = list(
145
+ chain.from_iterable(
146
+ [
147
+ range(self._num_clips)
148
+ for _ in range(len(self._path_to_videos))
149
+ ]
150
+ )
151
+ )
152
+ logger.info(
153
+ "Something-Something V2 dataloader constructed "
154
+ " (size: {}) from {}".format(
155
+ len(self._path_to_videos), path_to_file
156
+ )
157
+ )
158
+
159
+ def __getitem__(self, index):
160
+ """
161
+ Given the video index, return the list of frames, label, and video
162
+ index if the video frames can be fetched.
163
+ Args:
164
+ index (int): the video index provided by the pytorch sampler.
165
+ Returns:
166
+            frames (tensor): the frames sampled from the video. The dimension
167
+ is `channel` x `num frames` x `height` x `width`.
168
+ label (int): the label of the current video.
169
+ index (int): the index of the video.
170
+ """
171
+ short_cycle_idx = None
172
+        # When short cycle is used, input index is a tuple.
173
+ if isinstance(index, tuple):
174
+ index, short_cycle_idx = index
175
+
176
+ if self.mode in ["train", "val"]: #or self.cfg.MODEL.ARCH in ['resformer', 'vit']:
177
+ # -1 indicates random sampling.
178
+ spatial_sample_index = -1
179
+ min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
180
+ max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
181
+ crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
182
+ if short_cycle_idx in [0, 1]:
183
+ crop_size = int(
184
+ round(
185
+ self.cfg.MULTIGRID.SHORT_CYCLE_FACTORS[short_cycle_idx]
186
+ * self.cfg.MULTIGRID.DEFAULT_S
187
+ )
188
+ )
189
+ if self.cfg.MULTIGRID.DEFAULT_S > 0:
190
+ # Decreasing the scale is equivalent to using a larger "span"
191
+ # in a sampling grid.
192
+ min_scale = int(
193
+ round(
194
+ float(min_scale)
195
+ * crop_size
196
+ / self.cfg.MULTIGRID.DEFAULT_S
197
+ )
198
+ )
199
+ elif self.mode in ["test"]:
200
+ # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
201
+ # center, or right if width is larger than height, and top, middle,
202
+ # or bottom if height is larger than width.
203
+ spatial_sample_index = (
204
+ self._spatial_temporal_idx[index]
205
+ % self.cfg.TEST.NUM_SPATIAL_CROPS
206
+ )
207
+ if self.cfg.TEST.NUM_SPATIAL_CROPS == 1:
208
+ spatial_sample_index = 1
209
+
210
+ min_scale, max_scale, crop_size = [self.cfg.DATA.TEST_CROP_SIZE] * 3
211
+ # The testing is deterministic and no jitter should be performed.
212
+            # min_scale, max_scale, and crop_size are expected to be the same.
213
+ assert len({min_scale, max_scale, crop_size}) == 1
214
+ else:
215
+ raise NotImplementedError(
216
+ "Does not support {} mode".format(self.mode)
217
+ )
218
+
219
+ label = self._labels[index]
220
+
221
+ num_frames = self.cfg.DATA.NUM_FRAMES
222
+ video_length = len(self._path_to_videos[index])
223
+
224
+
225
+ seg_size = float(video_length - 1) / num_frames
226
+ seq = []
227
+ for i in range(num_frames):
228
+ start = int(np.round(seg_size * i))
229
+ end = int(np.round(seg_size * (i + 1)))
230
+ if self.mode == "train":
231
+ seq.append(random.randint(start, end))
232
+ else:
233
+ seq.append((start + end) // 2)
234
+
235
+ frames = torch.as_tensor(
236
+ utils.retry_load_images(
237
+ [self._path_to_videos[index][frame] for frame in seq],
238
+ self._num_retries,
239
+ )
240
+ )
241
+
242
+ # Perform color normalization.
243
+ frames = utils.tensor_normalize(
244
+ frames, self.cfg.DATA.MEAN, self.cfg.DATA.STD
245
+ )
246
+
247
+ # T H W C -> C T H W.
248
+ frames = frames.permute(3, 0, 1, 2)
249
+ frames = utils.spatial_sampling(
250
+ frames,
251
+ spatial_idx=spatial_sample_index,
252
+ min_scale=min_scale,
253
+ max_scale=max_scale,
254
+ crop_size=crop_size,
255
+ random_horizontal_flip=self.cfg.DATA.RANDOM_FLIP,
256
+ inverse_uniform_sampling=self.cfg.DATA.INV_UNIFORM_SAMPLE,
257
+ )
258
+ #if not self.cfg.RESFORMER.ACTIVE:
259
+ if not self.cfg.MODEL.ARCH in ['vit']:
260
+ frames = utils.pack_pathway_output(self.cfg, frames)
261
+ else:
262
+ # Perform temporal sampling from the fast pathway.
263
+ frames = torch.index_select(
264
+ frames,
265
+ 1,
266
+ torch.linspace(
267
+ 0, frames.shape[1] - 1, self.cfg.DATA.NUM_FRAMES
268
+
269
+ ).long(),
270
+ )
271
+ return frames, label, index, {}
272
+
273
+ def __len__(self):
274
+ """
275
+ Returns:
276
+ (int): the number of videos in the dataset.
277
+ """
278
+ return len(self._path_to_videos)
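The temporal sampling in `__getitem__` above splits a video into DATA.NUM_FRAMES equal segments and picks one index per segment (randomly during training, the segment center otherwise). A minimal standalone sketch, using made-up values for the video length and frame count:

    import random
    import numpy as np

    video_length = 100   # number of extracted frames for one video (assumed)
    num_frames = 8       # cfg.DATA.NUM_FRAMES (assumed)

    seg_size = float(video_length - 1) / num_frames
    train_seq, eval_seq = [], []
    for i in range(num_frames):
        start = int(np.round(seg_size * i))
        end = int(np.round(seg_size * (i + 1)))
        train_seq.append(random.randint(start, end))  # jittered index within segment i
        eval_seq.append((start + end) // 2)           # deterministic segment center

    print(eval_seq)   # roughly evenly spaced indices covering the whole video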
TimeSformer/timesformer/datasets/transform.py ADDED
@@ -0,0 +1,459 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import math
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+ def random_short_side_scale_jitter(
9
+ images, min_size, max_size, boxes=None, inverse_uniform_sampling=False
10
+ ):
11
+ """
12
+ Perform a spatial short scale jittering on the given images and
13
+ corresponding boxes.
14
+ Args:
15
+ images (tensor): images to perform scale jitter. Dimension is
16
+ `num frames` x `channel` x `height` x `width`.
17
+ min_size (int): the minimal size to scale the frames.
18
+ max_size (int): the maximal size to scale the frames.
19
+ boxes (ndarray): optional. Corresponding boxes to images.
20
+ Dimension is `num boxes` x 4.
21
+ inverse_uniform_sampling (bool): if True, sample uniformly in
22
+ [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
23
+ scale. If False, take a uniform sample from [min_scale, max_scale].
24
+ Returns:
25
+ (tensor): the scaled images with dimension of
26
+ `num frames` x `channel` x `new height` x `new width`.
27
+ (ndarray or None): the scaled boxes with dimension of
28
+ `num boxes` x 4.
29
+ """
30
+ if inverse_uniform_sampling:
31
+ size = int(
32
+ round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size))
33
+ )
34
+ else:
35
+ size = int(round(np.random.uniform(min_size, max_size)))
36
+
37
+ height = images.shape[2]
38
+ width = images.shape[3]
39
+ if (width <= height and width == size) or (
40
+ height <= width and height == size
41
+ ):
42
+ return images, boxes
43
+ new_width = size
44
+ new_height = size
45
+ if width < height:
46
+ new_height = int(math.floor((float(height) / width) * size))
47
+ if boxes is not None:
48
+ boxes = boxes * float(new_height) / height
49
+ else:
50
+ new_width = int(math.floor((float(width) / height) * size))
51
+ if boxes is not None:
52
+ boxes = boxes * float(new_width) / width
53
+
54
+ return (
55
+ torch.nn.functional.interpolate(
56
+ images,
57
+ size=(new_height, new_width),
58
+ mode="bilinear",
59
+ align_corners=False,
60
+ ),
61
+ boxes,
62
+ )
63
+
64
+
65
+ def crop_boxes(boxes, x_offset, y_offset):
66
+ """
67
+    Perform crop on the bounding boxes given the offsets.
68
+ Args:
69
+        boxes (ndarray or None): bounding boxes to perform crop. The dimension
70
+ is `num boxes` x 4.
71
+ x_offset (int): cropping offset in the x axis.
72
+ y_offset (int): cropping offset in the y axis.
73
+ Returns:
74
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
75
+ `num boxes` x 4.
76
+ """
77
+ cropped_boxes = boxes.copy()
78
+ cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
79
+ cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
80
+
81
+ return cropped_boxes
82
+
83
+
84
+ def random_crop(images, size, boxes=None):
85
+ """
86
+ Perform random spatial crop on the given images and corresponding boxes.
87
+ Args:
88
+ images (tensor): images to perform random crop. The dimension is
89
+ `num frames` x `channel` x `height` x `width`.
90
+ size (int): the size of height and width to crop on the image.
91
+ boxes (ndarray or None): optional. Corresponding boxes to images.
92
+ Dimension is `num boxes` x 4.
93
+ Returns:
94
+ cropped (tensor): cropped images with dimension of
95
+ `num frames` x `channel` x `size` x `size`.
96
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
97
+ `num boxes` x 4.
98
+ """
99
+ if images.shape[2] == size and images.shape[3] == size:
100
+ return images, None
101
+ height = images.shape[2]
102
+ width = images.shape[3]
103
+ y_offset = 0
104
+ if height > size:
105
+ y_offset = int(np.random.randint(0, height - size))
106
+ x_offset = 0
107
+ if width > size:
108
+ x_offset = int(np.random.randint(0, width - size))
109
+ cropped = images[
110
+ :, :, y_offset : y_offset + size, x_offset : x_offset + size
111
+ ]
112
+
113
+ cropped_boxes = (
114
+ crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
115
+ )
116
+
117
+ return cropped, cropped_boxes
118
+
119
+
120
+ def horizontal_flip(prob, images, boxes=None):
121
+ """
122
+ Perform horizontal flip on the given images and corresponding boxes.
123
+ Args:
124
+        prob (float): probability to flip the images.
125
+ images (tensor): images to perform horizontal flip, the dimension is
126
+ `num frames` x `channel` x `height` x `width`.
127
+ boxes (ndarray or None): optional. Corresponding boxes to images.
128
+ Dimension is `num boxes` x 4.
129
+ Returns:
130
+ images (tensor): images with dimension of
131
+ `num frames` x `channel` x `height` x `width`.
132
+ flipped_boxes (ndarray or None): the flipped boxes with dimension of
133
+ `num boxes` x 4.
134
+ """
135
+ if boxes is None:
136
+ flipped_boxes = None
137
+ else:
138
+ flipped_boxes = boxes.copy()
139
+
140
+ if np.random.uniform() < prob:
141
+ images = images.flip((-1))
142
+
143
+ width = images.shape[3]
144
+ if boxes is not None:
145
+ flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1
146
+
147
+ return images, flipped_boxes
148
+
149
+
150
+ def uniform_crop(images, size, spatial_idx, boxes=None):
151
+ """
152
+ Perform uniform spatial sampling on the images and corresponding boxes.
153
+ Args:
154
+ images (tensor): images to perform uniform crop. The dimension is
155
+ `num frames` x `channel` x `height` x `width`.
156
+        size (int): size of height and width to crop the images.
157
+ spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
158
+ is larger than height. Or 0, 1, or 2 for top, center, and bottom
159
+ crop if height is larger than width.
160
+ boxes (ndarray or None): optional. Corresponding boxes to images.
161
+ Dimension is `num boxes` x 4.
162
+ Returns:
163
+ cropped (tensor): images with dimension of
164
+ `num frames` x `channel` x `size` x `size`.
165
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
166
+ `num boxes` x 4.
167
+ """
168
+ assert spatial_idx in [0, 1, 2]
169
+ height = images.shape[2]
170
+ width = images.shape[3]
171
+
172
+ y_offset = int(math.ceil((height - size) / 2))
173
+ x_offset = int(math.ceil((width - size) / 2))
174
+
175
+ if height > width:
176
+ if spatial_idx == 0:
177
+ y_offset = 0
178
+ elif spatial_idx == 2:
179
+ y_offset = height - size
180
+ else:
181
+ if spatial_idx == 0:
182
+ x_offset = 0
183
+ elif spatial_idx == 2:
184
+ x_offset = width - size
185
+ cropped = images[
186
+ :, :, y_offset : y_offset + size, x_offset : x_offset + size
187
+ ]
188
+
189
+ cropped_boxes = (
190
+ crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
191
+ )
192
+
193
+ return cropped, cropped_boxes
194
+
195
+
196
+ def uniform_crop_2crops(images, size, spatial_idx, boxes=None):
197
+ """
198
+ Perform uniform spatial sampling on the images and corresponding boxes.
199
+ Args:
200
+ images (tensor): images to perform uniform crop. The dimension is
201
+ `num frames` x `channel` x `height` x `width`.
202
+        size (int): size of height and width to crop the images.
203
+ spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
204
+ is larger than height. Or 0, 1, or 2 for top, center, and bottom
205
+ crop if height is larger than width.
206
+ boxes (ndarray or None): optional. Corresponding boxes to images.
207
+ Dimension is `num boxes` x 4.
208
+ Returns:
209
+ cropped (tensor): images with dimension of
210
+ `num frames` x `channel` x `size` x `size`.
211
+ cropped_boxes (ndarray or None): the cropped boxes with dimension of
212
+ `num boxes` x 4.
213
+ """
214
+ assert spatial_idx in [0, 1, 2]
215
+ height = images.shape[2]
216
+ width = images.shape[3]
217
+
218
+
219
+ if height > width:
220
+ x_offset = 0
221
+ if height > size * 2:
222
+ if spatial_idx == 0:
223
+ y_offset = int((height - size * 2) // 2)
224
+ elif spatial_idx == 1:
225
+ y_offset = int(height - size - ((height - size * 2) // 2))
226
+ else:
227
+ if spatial_idx == 0:
228
+ y_offset = 0
229
+ elif spatial_idx == 1:
230
+ y_offset = height - size
231
+ else:
232
+ y_offset = 0
233
+ if width > size * 2:
234
+ if spatial_idx == 0:
235
+ x_offset = int((width - size * 2) // 2)
236
+ elif spatial_idx == 1:
237
+ x_offset = int(width - size - ((width - size * 2) // 2))
238
+ else:
239
+ if spatial_idx == 0:
240
+ x_offset = 0
241
+ elif spatial_idx == 1:
242
+ x_offset = width - size
243
+
244
+ cropped = images[
245
+ :, :, y_offset : y_offset + size, x_offset : x_offset + size
246
+ ]
247
+
248
+ cropped_boxes = (
249
+ crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
250
+ )
251
+
252
+ return cropped, cropped_boxes
253
+
254
+ def clip_boxes_to_image(boxes, height, width):
255
+ """
256
+ Clip an array of boxes to an image with the given height and width.
257
+ Args:
258
+ boxes (ndarray): bounding boxes to perform clipping.
259
+ Dimension is `num boxes` x 4.
260
+ height (int): given image height.
261
+ width (int): given image width.
262
+ Returns:
263
+ clipped_boxes (ndarray): the clipped boxes with dimension of
264
+ `num boxes` x 4.
265
+ """
266
+ clipped_boxes = boxes.copy()
267
+ clipped_boxes[:, [0, 2]] = np.minimum(
268
+ width - 1.0, np.maximum(0.0, boxes[:, [0, 2]])
269
+ )
270
+ clipped_boxes[:, [1, 3]] = np.minimum(
271
+ height - 1.0, np.maximum(0.0, boxes[:, [1, 3]])
272
+ )
273
+ return clipped_boxes
274
+
275
+
276
+ def blend(images1, images2, alpha):
277
+ """
278
+ Blend two images with a given weight alpha.
279
+ Args:
280
+ images1 (tensor): the first images to be blended, the dimension is
281
+ `num frames` x `channel` x `height` x `width`.
282
+ images2 (tensor): the second images to be blended, the dimension is
283
+ `num frames` x `channel` x `height` x `width`.
284
+ alpha (float): the blending weight.
285
+ Returns:
286
+ (tensor): blended images, the dimension is
287
+ `num frames` x `channel` x `height` x `width`.
288
+ """
289
+ return images1 * alpha + images2 * (1 - alpha)
290
+
291
+
292
+ def grayscale(images):
293
+ """
294
+ Get the grayscale for the input images. The channels of images should be
295
+ in order BGR.
296
+ Args:
297
+ images (tensor): the input images for getting grayscale. Dimension is
298
+ `num frames` x `channel` x `height` x `width`.
299
+ Returns:
300
+        img_gray (tensor): the grayscale images, the dimension is
301
+ `num frames` x `channel` x `height` x `width`.
302
+ """
303
+ # R -> 0.299, G -> 0.587, B -> 0.114.
304
+ img_gray = torch.tensor(images)
305
+ gray_channel = (
306
+ 0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0]
307
+ )
308
+ img_gray[:, 0] = gray_channel
309
+ img_gray[:, 1] = gray_channel
310
+ img_gray[:, 2] = gray_channel
311
+ return img_gray
312
+
313
+
314
+ def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
315
+ """
316
+    Perform color jittering on the input images. The channels of images
317
+ should be in order BGR.
318
+ Args:
319
+ images (tensor): images to perform color jitter. Dimension is
320
+ `num frames` x `channel` x `height` x `width`.
321
+ img_brightness (float): jitter ratio for brightness.
322
+ img_contrast (float): jitter ratio for contrast.
323
+ img_saturation (float): jitter ratio for saturation.
324
+ Returns:
325
+ images (tensor): the jittered images, the dimension is
326
+ `num frames` x `channel` x `height` x `width`.
327
+ """
328
+
329
+ jitter = []
330
+ if img_brightness != 0:
331
+ jitter.append("brightness")
332
+ if img_contrast != 0:
333
+ jitter.append("contrast")
334
+ if img_saturation != 0:
335
+ jitter.append("saturation")
336
+
337
+ if len(jitter) > 0:
338
+ order = np.random.permutation(np.arange(len(jitter)))
339
+ for idx in range(0, len(jitter)):
340
+ if jitter[order[idx]] == "brightness":
341
+ images = brightness_jitter(img_brightness, images)
342
+ elif jitter[order[idx]] == "contrast":
343
+ images = contrast_jitter(img_contrast, images)
344
+ elif jitter[order[idx]] == "saturation":
345
+ images = saturation_jitter(img_saturation, images)
346
+ return images
347
+
348
+
349
+ def brightness_jitter(var, images):
350
+ """
351
+    Perform brightness jittering on the input images. The channels of images
352
+ should be in order BGR.
353
+ Args:
354
+ var (float): jitter ratio for brightness.
355
+ images (tensor): images to perform color jitter. Dimension is
356
+ `num frames` x `channel` x `height` x `width`.
357
+ Returns:
358
+ images (tensor): the jittered images, the dimension is
359
+ `num frames` x `channel` x `height` x `width`.
360
+ """
361
+ alpha = 1.0 + np.random.uniform(-var, var)
362
+
363
+ img_bright = torch.zeros(images.shape)
364
+ images = blend(images, img_bright, alpha)
365
+ return images
366
+
367
+
368
+ def contrast_jitter(var, images):
369
+ """
370
+    Perform contrast jittering on the input images. The channels of images
371
+ should be in order BGR.
372
+ Args:
373
+ var (float): jitter ratio for contrast.
374
+ images (tensor): images to perform color jitter. Dimension is
375
+ `num frames` x `channel` x `height` x `width`.
376
+ Returns:
377
+ images (tensor): the jittered images, the dimension is
378
+ `num frames` x `channel` x `height` x `width`.
379
+ """
380
+ alpha = 1.0 + np.random.uniform(-var, var)
381
+
382
+ img_gray = grayscale(images)
383
+ img_gray[:] = torch.mean(img_gray, dim=(1, 2, 3), keepdim=True)
384
+ images = blend(images, img_gray, alpha)
385
+ return images
386
+
387
+
388
+ def saturation_jitter(var, images):
389
+ """
390
+    Perform saturation jittering on the input images. The channels of images
391
+ should be in order BGR.
392
+ Args:
393
+ var (float): jitter ratio for saturation.
394
+ images (tensor): images to perform color jitter. Dimension is
395
+ `num frames` x `channel` x `height` x `width`.
396
+ Returns:
397
+ images (tensor): the jittered images, the dimension is
398
+ `num frames` x `channel` x `height` x `width`.
399
+ """
400
+ alpha = 1.0 + np.random.uniform(-var, var)
401
+ img_gray = grayscale(images)
402
+ images = blend(images, img_gray, alpha)
403
+
404
+ return images
405
+
406
+
407
+ def lighting_jitter(images, alphastd, eigval, eigvec):
408
+ """
409
+ Perform AlexNet-style PCA jitter on the given images.
410
+ Args:
411
+ images (tensor): images to perform lighting jitter. Dimension is
412
+ `num frames` x `channel` x `height` x `width`.
413
+ alphastd (float): jitter ratio for PCA jitter.
414
+ eigval (list): eigenvalues for PCA jitter.
415
+ eigvec (list[list]): eigenvectors for PCA jitter.
416
+ Returns:
417
+ out_images (tensor): the jittered images, the dimension is
418
+ `num frames` x `channel` x `height` x `width`.
419
+ """
420
+ if alphastd == 0:
421
+ return images
422
+ # generate alpha1, alpha2, alpha3.
423
+ alpha = np.random.normal(0, alphastd, size=(1, 3))
424
+ eig_vec = np.array(eigvec)
425
+ eig_val = np.reshape(eigval, (1, 3))
426
+ rgb = np.sum(
427
+ eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0),
428
+ axis=1,
429
+ )
430
+ out_images = torch.zeros_like(images)
431
+ for idx in range(images.shape[1]):
432
+ out_images[:, idx] = images[:, idx] + rgb[2 - idx]
433
+
434
+ return out_images
435
+
436
+
437
+ def color_normalization(images, mean, stddev):
438
+ """
439
+    Perform color normalization on the given images.
440
+ Args:
441
+ images (tensor): images to perform color normalization. Dimension is
442
+ `num frames` x `channel` x `height` x `width`.
443
+ mean (list): mean values for normalization.
444
+ stddev (list): standard deviations for normalization.
445
+
446
+ Returns:
447
+        out_images (tensor): the normalized images, the dimension is
448
+ `num frames` x `channel` x `height` x `width`.
449
+ """
450
+ assert len(mean) == images.shape[1], "channel mean not computed properly"
451
+ assert (
452
+ len(stddev) == images.shape[1]
453
+ ), "channel stddev not computed properly"
454
+
455
+ out_images = torch.zeros_like(images)
456
+ for idx in range(len(mean)):
457
+ out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx]
458
+
459
+ return out_images
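Taken together, these functions form the train-time augmentation chain used by the dataset loaders. A hedged usage sketch on a random clip (the sizes are made up, and the tensor follows the `num frames` x `channel` x `height` x `width` layout documented above):

    import torch
    from timesformer.datasets import transform

    clip = torch.rand(8, 3, 240, 320)   # 8 frames, 3 channels, 240 x 320 (illustrative)

    clip, _ = transform.random_short_side_scale_jitter(clip, min_size=256, max_size=320)
    clip, _ = transform.random_crop(clip, size=224)
    clip, _ = transform.horizontal_flip(0.5, clip)
    clip = transform.color_normalization(
        clip, mean=[0.45, 0.45, 0.45], stddev=[0.225, 0.225, 0.225]
    )
    print(clip.shape)   # torch.Size([8, 3, 224, 224])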
TimeSformer/timesformer/datasets/utils.py ADDED
@@ -0,0 +1,380 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import logging
4
+ import numpy as np
5
+ import os
6
+ import random
7
+ import time
8
+ from collections import defaultdict
9
+ import cv2
10
+ import torch
11
+ from fvcore.common.file_io import PathManager
12
+ from torch.utils.data.distributed import DistributedSampler
13
+
14
+ from . import transform as transform
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def retry_load_images(image_paths, retry=10, backend="pytorch"):
20
+ """
21
+ This function is to load images with support of retrying for failed load.
22
+
23
+ Args:
24
+ image_paths (list): paths of images needed to be loaded.
25
+        retry (int, optional): maximum number of load retries. Defaults to 10.
26
+ backend (str): `pytorch` or `cv2`.
27
+
28
+ Returns:
29
+ imgs (list): list of loaded images.
30
+ """
31
+ for i in range(retry):
32
+ imgs = []
33
+ for image_path in image_paths:
34
+ with PathManager.open(image_path, "rb") as f:
35
+ img_str = np.frombuffer(f.read(), np.uint8)
36
+ img = cv2.imdecode(img_str, flags=cv2.IMREAD_COLOR)
37
+ imgs.append(img)
38
+
39
+ if all(img is not None for img in imgs):
40
+ if backend == "pytorch":
41
+ imgs = torch.as_tensor(np.stack(imgs))
42
+ return imgs
43
+ else:
44
+ logger.warn("Reading failed. Will retry.")
45
+ time.sleep(1.0)
46
+ if i == retry - 1:
47
+ raise Exception("Failed to load images {}".format(image_paths))
48
+
49
+
50
+ def get_sequence(center_idx, half_len, sample_rate, num_frames):
51
+ """
52
+ Sample frames among the corresponding clip.
53
+
54
+ Args:
55
+ center_idx (int): center frame idx for current clip
56
+ half_len (int): half of the clip length
57
+ sample_rate (int): sampling rate for sampling frames inside of the clip
58
+ num_frames (int): number of expected sampled frames
59
+
60
+ Returns:
61
+ seq (list): list of indexes of sampled frames in this clip.
62
+ """
63
+ seq = list(range(center_idx - half_len, center_idx + half_len, sample_rate))
64
+
65
+ for seq_idx in range(len(seq)):
66
+ if seq[seq_idx] < 0:
67
+ seq[seq_idx] = 0
68
+ elif seq[seq_idx] >= num_frames:
69
+ seq[seq_idx] = num_frames - 1
70
+ return seq
71
+
72
+
73
+ def pack_pathway_output(cfg, frames):
74
+ """
75
+    Prepare output as a list of tensors. Each tensor corresponds to a
76
+ unique pathway.
77
+ Args:
78
+ frames (tensor): frames of images sampled from the video. The
79
+ dimension is `channel` x `num frames` x `height` x `width`.
80
+ Returns:
81
+ frame_list (list): list of tensors with the dimension of
82
+ `channel` x `num frames` x `height` x `width`.
83
+ """
84
+ if cfg.DATA.REVERSE_INPUT_CHANNEL:
85
+ frames = frames[[2, 1, 0], :, :, :]
86
+ if cfg.MODEL.ARCH in cfg.MODEL.SINGLE_PATHWAY_ARCH:
87
+ frame_list = [frames]
88
+ elif cfg.MODEL.ARCH in cfg.MODEL.MULTI_PATHWAY_ARCH:
89
+ fast_pathway = frames
90
+ # Perform temporal sampling from the fast pathway.
91
+ slow_pathway = torch.index_select(
92
+ frames,
93
+ 1,
94
+ torch.linspace(
95
+ 0, frames.shape[1] - 1, frames.shape[1] // cfg.SLOWFAST.ALPHA
96
+ ).long(),
97
+ )
98
+ frame_list = [slow_pathway, fast_pathway]
99
+ else:
100
+ raise NotImplementedError(
101
+ "Model arch {} is not in {}".format(
102
+ cfg.MODEL.ARCH,
103
+ cfg.MODEL.SINGLE_PATHWAY_ARCH + cfg.MODEL.MULTI_PATHWAY_ARCH,
104
+ )
105
+ )
106
+ return frame_list
107
+
108
+
109
+ def spatial_sampling(
110
+ frames,
111
+ spatial_idx=-1,
112
+ min_scale=256,
113
+ max_scale=320,
114
+ crop_size=224,
115
+ random_horizontal_flip=True,
116
+ inverse_uniform_sampling=False,
117
+ ):
118
+ """
119
+ Perform spatial sampling on the given video frames. If spatial_idx is
120
+ -1, perform random scale, random crop, and random flip on the given
121
+ frames. If spatial_idx is 0, 1, or 2, perform spatial uniform sampling
122
+ with the given spatial_idx.
123
+ Args:
124
+ frames (tensor): frames of images sampled from the video. The
125
+ dimension is `num frames` x `height` x `width` x `channel`.
126
+ spatial_idx (int): if -1, perform random spatial sampling. If 0, 1,
127
+ or 2, perform left, center, right crop if width is larger than
128
+ height, and perform top, center, buttom crop if height is larger
129
+ than width.
130
+ min_scale (int): the minimal size of scaling.
131
+ max_scale (int): the maximal size of scaling.
132
+ crop_size (int): the size of height and width used to crop the
133
+ frames.
134
+ inverse_uniform_sampling (bool): if True, sample uniformly in
135
+ [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
136
+ scale. If False, take a uniform sample from [min_scale,
137
+ max_scale].
138
+ Returns:
139
+ frames (tensor): spatially sampled frames.
140
+ """
141
+ assert spatial_idx in [-1, 0, 1, 2]
142
+ if spatial_idx == -1:
143
+ frames, _ = transform.random_short_side_scale_jitter(
144
+ images=frames,
145
+ min_size=min_scale,
146
+ max_size=max_scale,
147
+ inverse_uniform_sampling=inverse_uniform_sampling,
148
+ )
149
+ frames, _ = transform.random_crop(frames, crop_size)
150
+ if random_horizontal_flip:
151
+ frames, _ = transform.horizontal_flip(0.5, frames)
152
+ else:
153
+ # The testing is deterministic and no jitter should be performed.
154
+ # min_scale, max_scale, and crop_size are expect to be the same.
155
+ #assert len({min_scale, max_scale, crop_size}) == 1
156
+ frames, _ = transform.random_short_side_scale_jitter(
157
+ frames, min_scale, max_scale
158
+ )
159
+ frames, _ = transform.uniform_crop(frames, crop_size, spatial_idx)
160
+ return frames
161
+
162
+ def spatial_sampling_2crops(
163
+ frames,
164
+ spatial_idx=-1,
165
+ min_scale=256,
166
+ max_scale=320,
167
+ crop_size=224,
168
+ random_horizontal_flip=True,
169
+ inverse_uniform_sampling=False,
170
+ ):
171
+ """
172
+ Perform spatial sampling on the given video frames. If spatial_idx is
173
+ -1, perform random scale, random crop, and random flip on the given
174
+ frames. If spatial_idx is 0, 1, or 2, perform spatial uniform sampling
175
+ with the given spatial_idx.
176
+ Args:
177
+ frames (tensor): frames of images sampled from the video. The
178
+ dimension is `num frames` x `height` x `width` x `channel`.
179
+ spatial_idx (int): if -1, perform random spatial sampling. If 0, 1,
180
+ or 2, perform left, center, right crop if width is larger than
181
+ height, and perform top, center, buttom crop if height is larger
182
+ than width.
183
+ min_scale (int): the minimal size of scaling.
184
+ max_scale (int): the maximal size of scaling.
185
+ crop_size (int): the size of height and width used to crop the
186
+ frames.
187
+ inverse_uniform_sampling (bool): if True, sample uniformly in
188
+ [1 / max_scale, 1 / min_scale] and take a reciprocal to get the
189
+ scale. If False, take a uniform sample from [min_scale,
190
+ max_scale].
191
+ Returns:
192
+ frames (tensor): spatially sampled frames.
193
+ """
194
+ assert spatial_idx in [-1, 0, 1, 2]
195
+ if spatial_idx == -1:
196
+ frames, _ = transform.random_short_side_scale_jitter(
197
+ images=frames,
198
+ min_size=min_scale,
199
+ max_size=max_scale,
200
+ inverse_uniform_sampling=inverse_uniform_sampling,
201
+ )
202
+ frames, _ = transform.random_crop(frames, crop_size)
203
+ if random_horizontal_flip:
204
+ frames, _ = transform.horizontal_flip(0.5, frames)
205
+ else:
206
+ # The testing is deterministic and no jitter should be performed.
207
+ # min_scale, max_scale, and crop_size are expect to be the same.
208
+ #assert len({min_scale, max_scale, crop_size}) == 1
209
+ frames, _ = transform.random_short_side_scale_jitter(
210
+ frames, min_scale, max_scale
211
+ )
212
+ frames, _ = transform.uniform_crop_2crops(frames, crop_size, spatial_idx)
213
+ return frames
214
+
215
+
216
+ def as_binary_vector(labels, num_classes):
217
+ """
218
+ Construct binary label vector given a list of label indices.
219
+ Args:
220
+ labels (list): The input label list.
221
+ num_classes (int): Number of classes of the label vector.
222
+ Returns:
223
+ labels (numpy array): the resulting binary vector.
224
+ """
225
+ label_arr = np.zeros((num_classes,))
226
+
227
+ for lbl in set(labels):
228
+ label_arr[lbl] = 1.0
229
+ return label_arr
230
+
231
+
232
+ def aggregate_labels(label_list):
233
+ """
234
+    Join a list of label lists.
235
+ Args:
236
+ labels (list): The input label list.
237
+ Returns:
238
+ labels (list): The joint list of all lists in input.
239
+ """
240
+ all_labels = []
241
+ for labels in label_list:
242
+ for l in labels:
243
+ all_labels.append(l)
244
+ return list(set(all_labels))
245
+
246
+
247
+ def convert_to_video_level_labels(labels):
248
+ """
249
+ Aggregate annotations from all frames of a video to form video-level labels.
250
+ Args:
251
+ labels (list): The input label list.
252
+ Returns:
253
+ labels (list): Same as input, but with each label replaced by
254
+ a video-level one.
255
+ """
256
+ for video_id in range(len(labels)):
257
+ video_level_labels = aggregate_labels(labels[video_id])
258
+ for i in range(len(labels[video_id])):
259
+ labels[video_id][i] = video_level_labels
260
+ return labels
261
+
262
+
263
+ def load_image_lists(frame_list_file, prefix="", return_list=False):
264
+ """
265
+ Load image paths and labels from a "frame list".
266
+ Each line of the frame list contains:
267
+ `original_vido_id video_id frame_id path labels`
268
+ Args:
269
+ frame_list_file (string): path to the frame list.
270
+ prefix (str): the prefix for the path.
271
+ return_list (bool): if True, return a list. If False, return a dict.
272
+ Returns:
273
+ image_paths (list or dict): list of list containing path to each frame.
274
+ If return_list is False, then return in a dict form.
275
+ labels (list or dict): list of list containing label of each frame.
276
+ If return_list is False, then return in a dict form.
277
+ """
278
+ image_paths = defaultdict(list)
279
+ labels = defaultdict(list)
280
+ with PathManager.open(frame_list_file, "r") as f:
281
+ assert f.readline().startswith("original_vido_id")
282
+ for line in f:
283
+ row = line.split()
284
+ # original_vido_id video_id frame_id path labels
285
+ assert len(row) == 5
286
+ video_name = row[0]
287
+ if prefix == "":
288
+ path = row[3]
289
+ else:
290
+ path = os.path.join(prefix, row[3])
291
+ image_paths[video_name].append(path)
292
+ frame_labels = row[-1].replace('"', "")
293
+ if frame_labels != "":
294
+ labels[video_name].append(
295
+ [int(x) for x in frame_labels.split(",")]
296
+ )
297
+ else:
298
+ labels[video_name].append([])
299
+
300
+ if return_list:
301
+ keys = image_paths.keys()
302
+ image_paths = [image_paths[key] for key in keys]
303
+ labels = [labels[key] for key in keys]
304
+ return image_paths, labels
305
+ return dict(image_paths), dict(labels)
306
+
307
+
308
+ def tensor_normalize(tensor, mean, std):
309
+ """
310
+ Normalize a given tensor by subtracting the mean and dividing the std.
311
+ Args:
312
+ tensor (tensor): tensor to normalize.
313
+ mean (tensor or list): mean value to subtract.
314
+ std (tensor or list): std to divide.
315
+ """
316
+ if tensor.dtype == torch.uint8:
317
+ tensor = tensor.float()
318
+ tensor = tensor / 255.0
319
+ if type(mean) == list:
320
+ mean = torch.tensor(mean)
321
+ if type(std) == list:
322
+ std = torch.tensor(std)
323
+ tensor = tensor - mean
324
+ tensor = tensor / std
325
+ return tensor
326
+
327
+
328
+ def get_random_sampling_rate(long_cycle_sampling_rate, sampling_rate):
329
+ """
330
+    When multigrid training uses fewer frames, we randomly
331
+ increase the sampling rate so that some clips cover the original span.
332
+ """
333
+ if long_cycle_sampling_rate > 0:
334
+ assert long_cycle_sampling_rate >= sampling_rate
335
+ return random.randint(sampling_rate, long_cycle_sampling_rate)
336
+ else:
337
+ return sampling_rate
338
+
339
+
340
+ def revert_tensor_normalize(tensor, mean, std):
341
+ """
342
+ Revert normalization for a given tensor by multiplying by the std and adding the mean.
343
+ Args:
344
+ tensor (tensor): tensor to revert normalization.
345
+ mean (tensor or list): mean value to add.
346
+ std (tensor or list): std to multiply.
347
+ """
348
+ if type(mean) == list:
349
+ mean = torch.tensor(mean)
350
+ if type(std) == list:
351
+ std = torch.tensor(std)
352
+ tensor = tensor * std
353
+ tensor = tensor + mean
354
+ return tensor
355
+
356
+
357
+ def create_sampler(dataset, shuffle, cfg):
358
+ """
359
+ Create sampler for the given dataset.
360
+ Args:
361
+ dataset (torch.utils.data.Dataset): the given dataset.
362
+ shuffle (bool): set to ``True`` to have the data reshuffled
363
+ at every epoch.
364
+ cfg (CfgNode): configs. Details can be found in
365
+ slowfast/config/defaults.py
366
+ Returns:
367
+ sampler (Sampler): the created sampler.
368
+ """
369
+ sampler = DistributedSampler(dataset) if cfg.NUM_GPUS > 1 else None
370
+
371
+ return sampler
372
+
373
+
374
+ def loader_worker_init_fn(dataset):
375
+ """
376
+ Create init function passed to pytorch data loader.
377
+ Args:
378
+ dataset (torch.utils.data.Dataset): the given dataset.
379
+ """
380
+ return None
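A hedged sketch of the post-decoding path these helpers implement for the dataset classes: normalize raw uint8 frames, permute to `channel` x `num frames` x `height` x `width`, then spatially sample. The mean/std values and tensor sizes are illustrative assumptions rather than configured defaults.

    import torch
    from timesformer.datasets import utils

    frames = torch.randint(0, 256, (8, 240, 320, 3), dtype=torch.uint8)  # T x H x W x C

    frames = utils.tensor_normalize(
        frames, mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225]
    )
    frames = frames.permute(3, 0, 1, 2)   # -> C x T x H x W
    frames = utils.spatial_sampling(
        frames,
        spatial_idx=-1,   # -1: random scale, crop, and flip (training path)
        min_scale=256,
        max_scale=320,
        crop_size=224,
    )
    print(frames.shape)   # torch.Size([3, 8, 224, 224])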
TimeSformer/timesformer/datasets/video_container.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ import av
4
+
5
+
6
+ def get_video_container(path_to_vid, multi_thread_decode=False, backend="pyav"):
7
+ """
8
+ Given the path to the video, return the pyav video container.
9
+ Args:
10
+ path_to_vid (str): path to the video.
11
+ multi_thread_decode (bool): if True, perform multi-thread decoding.
12
+ backend (str): decoder backend, options include `pyav` and
13
+ `torchvision`, default is `pyav`.
14
+ Returns:
15
+ container (container): video container.
16
+ """
17
+ if backend == "torchvision":
18
+ with open(path_to_vid, "rb") as fp:
19
+ container = fp.read()
20
+ return container
21
+ elif backend == "pyav":
22
+ #try:
23
+ container = av.open(path_to_vid)
24
+ if multi_thread_decode:
25
+ # Enable multiple threads for decoding.
26
+ container.streams.video[0].thread_type = "AUTO"
27
+ #except:
28
+ # container = None
29
+ return container
30
+ else:
31
+ raise NotImplementedError("Unknown backend {}".format(backend))
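A brief usage sketch for the default PyAV backend; the path is a placeholder and the `av` package must be installed:

    from timesformer.datasets.video_container import get_video_container

    container = get_video_container(
        "/path/to/video.mp4",       # placeholder path
        multi_thread_decode=True,   # let PyAV decode with multiple threads
        backend="pyav",
    )
    # Frame count reported by the container metadata (may be 0 if unknown).
    print(container.streams.video[0].frames)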
TimeSformer/timesformer/models/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ from .build import MODEL_REGISTRY, build_model # noqa
4
+ from .custom_video_model_builder import * # noqa
5
+ from .video_model_builder import ResNet, SlowFast # noqa
TimeSformer/timesformer/models/batchnorm_helper.py ADDED
@@ -0,0 +1,217 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """BatchNorm (BN) utility functions and custom batch-size BN implementations"""
4
+
5
+ from functools import partial
6
+ import torch
7
+ import torch.distributed as dist
8
+ import torch.nn as nn
9
+ from torch.autograd.function import Function
10
+
11
+ import timesformer.utils.distributed as du
12
+
13
+
14
+ def get_norm(cfg):
15
+ """
16
+ Args:
17
+ cfg (CfgNode): model building configs, details are in the comments of
18
+ the config file.
19
+ Returns:
20
+ nn.Module: the normalization layer.
21
+ """
22
+ if cfg.BN.NORM_TYPE == "batchnorm":
23
+ return nn.BatchNorm3d
24
+ elif cfg.BN.NORM_TYPE == "sub_batchnorm":
25
+ return partial(SubBatchNorm3d, num_splits=cfg.BN.NUM_SPLITS)
26
+ elif cfg.BN.NORM_TYPE == "sync_batchnorm":
27
+ return partial(
28
+ NaiveSyncBatchNorm3d, num_sync_devices=cfg.BN.NUM_SYNC_DEVICES
29
+ )
30
+ else:
31
+ raise NotImplementedError(
32
+ "Norm type {} is not supported".format(cfg.BN.NORM_TYPE)
33
+ )
34
+
35
+
36
+ class SubBatchNorm3d(nn.Module):
37
+ """
38
+ The standard BN layer computes stats across all examples in a GPU. In some
39
+ cases it is desirable to compute stats across only a subset of examples
40
+ (e.g., in multigrid training https://arxiv.org/abs/1912.00998).
41
+ SubBatchNorm3d splits the batch dimension into N splits, and run BN on
42
+ each of them separately (so that the stats are computed on each subset of
43
+ examples (1/N of batch) independently. During evaluation, it aggregates
44
+ the stats from all splits into one BN.
45
+ """
46
+
47
+ def __init__(self, num_splits, **args):
48
+ """
49
+ Args:
50
+ num_splits (int): number of splits.
51
+ args (list): other arguments.
52
+ """
53
+ super(SubBatchNorm3d, self).__init__()
54
+ self.num_splits = num_splits
55
+ num_features = args["num_features"]
56
+ # Keep only one set of weight and bias.
57
+ if args.get("affine", True):
58
+ self.affine = True
59
+ args["affine"] = False
60
+ self.weight = torch.nn.Parameter(torch.ones(num_features))
61
+ self.bias = torch.nn.Parameter(torch.zeros(num_features))
62
+ else:
63
+ self.affine = False
64
+ self.bn = nn.BatchNorm3d(**args)
65
+ args["num_features"] = num_features * num_splits
66
+ self.split_bn = nn.BatchNorm3d(**args)
67
+
68
+ def _get_aggregated_mean_std(self, means, stds, n):
69
+ """
70
+ Calculate the aggregated mean and stds.
71
+ Args:
72
+ means (tensor): mean values.
73
+ stds (tensor): standard deviations.
74
+ n (int): number of sets of means and stds.
75
+ """
76
+ mean = means.view(n, -1).sum(0) / n
77
+ std = (
78
+ stds.view(n, -1).sum(0) / n
79
+ + ((means.view(n, -1) - mean) ** 2).view(n, -1).sum(0) / n
80
+ )
81
+ return mean.detach(), std.detach()
82
+
83
+ def aggregate_stats(self):
84
+ """
85
+ Synchronize running_mean, and running_var. Call this before eval.
86
+ """
87
+ if self.split_bn.track_running_stats:
88
+ (
89
+ self.bn.running_mean.data,
90
+ self.bn.running_var.data,
91
+ ) = self._get_aggregated_mean_std(
92
+ self.split_bn.running_mean,
93
+ self.split_bn.running_var,
94
+ self.num_splits,
95
+ )
96
+
97
+ def forward(self, x):
98
+ if self.training:
99
+ n, c, t, h, w = x.shape
100
+ x = x.view(n // self.num_splits, c * self.num_splits, t, h, w)
101
+ x = self.split_bn(x)
102
+ x = x.view(n, c, t, h, w)
103
+ else:
104
+ x = self.bn(x)
105
+ if self.affine:
106
+ x = x * self.weight.view((-1, 1, 1, 1))
107
+ x = x + self.bias.view((-1, 1, 1, 1))
108
+ return x
109
+
110
+
111
+ class GroupGather(Function):
112
+ """
113
+ GroupGather performs all gather on each of the local process/ GPU groups.
114
+ """
115
+
116
+ @staticmethod
117
+ def forward(ctx, input, num_sync_devices, num_groups):
118
+ """
119
+ Perform forwarding, gathering the stats across different process/ GPU
120
+ group.
121
+ """
122
+ ctx.num_sync_devices = num_sync_devices
123
+ ctx.num_groups = num_groups
124
+
125
+ input_list = [
126
+ torch.zeros_like(input) for k in range(du.get_local_size())
127
+ ]
128
+ dist.all_gather(
129
+ input_list, input, async_op=False, group=du._LOCAL_PROCESS_GROUP
130
+ )
131
+
132
+ inputs = torch.stack(input_list, dim=0)
133
+ if num_groups > 1:
134
+ rank = du.get_local_rank()
135
+ group_idx = rank // num_sync_devices
136
+ inputs = inputs[
137
+ group_idx
138
+ * num_sync_devices : (group_idx + 1)
139
+ * num_sync_devices
140
+ ]
141
+ inputs = torch.sum(inputs, dim=0)
142
+ return inputs
143
+
144
+ @staticmethod
145
+ def backward(ctx, grad_output):
146
+ """
147
+ Perform backwarding, gathering the gradients across different process/ GPU
148
+ group.
149
+ """
150
+ grad_output_list = [
151
+ torch.zeros_like(grad_output) for k in range(du.get_local_size())
152
+ ]
153
+ dist.all_gather(
154
+ grad_output_list,
155
+ grad_output,
156
+ async_op=False,
157
+ group=du._LOCAL_PROCESS_GROUP,
158
+ )
159
+
160
+ grads = torch.stack(grad_output_list, dim=0)
161
+ if ctx.num_groups > 1:
162
+ rank = du.get_local_rank()
163
+ group_idx = rank // ctx.num_sync_devices
164
+ grads = grads[
165
+ group_idx
166
+ * ctx.num_sync_devices : (group_idx + 1)
167
+ * ctx.num_sync_devices
168
+ ]
169
+ grads = torch.sum(grads, dim=0)
170
+ return grads, None, None
171
+
172
+
173
+ class NaiveSyncBatchNorm3d(nn.BatchNorm3d):
174
+ def __init__(self, num_sync_devices, **args):
175
+ """
176
+ Naive version of Synchronized 3D BatchNorm.
177
+ Args:
178
+ num_sync_devices (int): number of device to sync.
179
+ args (list): other arguments.
180
+ """
181
+ self.num_sync_devices = num_sync_devices
182
+ if self.num_sync_devices > 0:
183
+ assert du.get_local_size() % self.num_sync_devices == 0, (
184
+ du.get_local_size(),
185
+ self.num_sync_devices,
186
+ )
187
+ self.num_groups = du.get_local_size() // self.num_sync_devices
188
+ else:
189
+ self.num_sync_devices = du.get_local_size()
190
+ self.num_groups = 1
191
+ super(NaiveSyncBatchNorm3d, self).__init__(**args)
192
+
193
+ def forward(self, input):
194
+ if du.get_local_size() == 1 or not self.training:
195
+ return super().forward(input)
196
+
197
+ assert input.shape[0] > 0, "SyncBatchNorm does not support empty inputs"
198
+ C = input.shape[1]
199
+ mean = torch.mean(input, dim=[0, 2, 3, 4])
200
+ meansqr = torch.mean(input * input, dim=[0, 2, 3, 4])
201
+
202
+ vec = torch.cat([mean, meansqr], dim=0)
203
+ vec = GroupGather.apply(vec, self.num_sync_devices, self.num_groups) * (
204
+ 1.0 / self.num_sync_devices
205
+ )
206
+
207
+ mean, meansqr = torch.split(vec, C)
208
+ var = meansqr - mean * mean
209
+ self.running_mean += self.momentum * (mean.detach() - self.running_mean)
210
+ self.running_var += self.momentum * (var.detach() - self.running_var)
211
+
212
+ invstd = torch.rsqrt(var + self.eps)
213
+ scale = self.weight * invstd
214
+ bias = self.bias - mean * scale
215
+ scale = scale.reshape(1, -1, 1, 1, 1)
216
+ bias = bias.reshape(1, -1, 1, 1, 1)
217
+ return input * scale + bias
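A hedged sketch of how SubBatchNorm3d is meant to be used: statistics are computed per split during training, and aggregate_stats() folds them into the single BN used at evaluation time. Sizes below are made up; the batch dimension must be divisible by num_splits.

    import torch
    from timesformer.models.batchnorm_helper import SubBatchNorm3d

    bn = SubBatchNorm3d(num_splits=2, num_features=16)
    x = torch.randn(8, 16, 4, 56, 56)   # N x C x T x H x W

    bn.train()
    y = bn(x)               # stats computed independently on two splits of 4 clips
    bn.aggregate_stats()    # merge split statistics into the eval-time BN
    bn.eval()
    y_eval = bn(x)
    print(y.shape, y_eval.shape)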
TimeSformer/timesformer/models/build.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """Model construction functions."""
4
+
5
+ import torch
6
+ from fvcore.common.registry import Registry
7
+
8
+ MODEL_REGISTRY = Registry("MODEL")
9
+ MODEL_REGISTRY.__doc__ = """
10
+ Registry for video model.
11
+
12
+ The registered object will be called with `obj(cfg)`.
13
+ The call should return a `torch.nn.Module` object.
14
+ """
15
+
16
+
17
+ def build_model(cfg, gpu_id=None):
18
+ """
19
+ Builds the video model.
20
+ Args:
21
+ cfg (configs): configs that contains the hyper-parameters to build the
22
+ backbone. Details can be seen in slowfast/config/defaults.py.
23
+ gpu_id (Optional[int]): specify the gpu index to build model.
24
+ """
25
+ if torch.cuda.is_available():
26
+ assert (
27
+ cfg.NUM_GPUS <= torch.cuda.device_count()
28
+ ), "Cannot use more GPU devices than available"
29
+ else:
30
+ assert (
31
+ cfg.NUM_GPUS == 0
32
+ ), "Cuda is not available. Please set `NUM_GPUS: 0 for running on CPUs."
33
+
34
+ # Construct the model
35
+ name = cfg.MODEL.MODEL_NAME
36
+ model = MODEL_REGISTRY.get(name)(cfg)
37
+
38
+ if cfg.NUM_GPUS:
39
+ if gpu_id is None:
40
+ # Determine the GPU used by the current process
41
+ cur_device = torch.cuda.current_device()
42
+ else:
43
+ cur_device = gpu_id
44
+ # Transfer the model to the current GPU device
45
+ model = model.cuda(device=cur_device)
46
+
47
+
48
+ # Use multi-process data parallel model in the multi-gpu setting
49
+ if cfg.NUM_GPUS > 1:
50
+ # Make model replica operate on the current device
51
+ model = torch.nn.parallel.DistributedDataParallel(
52
+ module=model, device_ids=[cur_device], output_device=cur_device
53
+ )
54
+ return model
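New architectures plug into build_model through the registry above: a class decorated with @MODEL_REGISTRY.register() becomes selectable via cfg.MODEL.MODEL_NAME and is constructed with `obj(cfg)` as the registry docstring states. The toy model below is purely illustrative, and its config fields are assumptions.

    import torch.nn as nn
    from timesformer.models.build import MODEL_REGISTRY

    @MODEL_REGISTRY.register()
    class TinyVideoNet(nn.Module):
        def __init__(self, cfg):
            super().__init__()
            self.head = nn.Linear(16, cfg.MODEL.NUM_CLASSES)  # assumed config field

        def forward(self, x):
            return self.head(x)

    # With cfg.MODEL.MODEL_NAME == "TinyVideoNet" and cfg.NUM_GPUS == 0,
    # build_model(cfg) looks the class up in MODEL_REGISTRY and returns it on CPU.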
TimeSformer/timesformer/models/conv2d_same.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright 2020 Ross Wightman
2
+ # Conv2d w/ Same Padding
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from typing import Tuple, Optional
8
+
9
+ import math
10
+ from typing import List, Tuple
11
+ #from .padding import pad_same, get_padding_value
12
+
13
+ # Dynamically pad input x with 'SAME' padding for conv with specified args
14
+ def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0):
15
+ ih, iw = x.size()[-2:]
16
+ pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1])
17
+ if pad_h > 0 or pad_w > 0:
18
+ x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value)
19
+ return x
20
+
21
+ # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution
22
+ def get_same_padding(x: int, k: int, s: int, d: int):
23
+ return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)
24
+
25
+ def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
26
+ dynamic = False
27
+ if isinstance(padding, str):
28
+ # for any string padding, the padding will be calculated for you, one of three ways
29
+ padding = padding.lower()
30
+ if padding == 'same':
31
+ # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
32
+ if is_static_pad(kernel_size, **kwargs):
33
+ # static case, no extra overhead
34
+ padding = get_padding(kernel_size, **kwargs)
35
+ else:
36
+ # dynamic 'SAME' padding, has runtime/GPU memory overhead
37
+ padding = 0
38
+ dynamic = True
39
+ elif padding == 'valid':
40
+ # 'VALID' padding, same as padding=0
41
+ padding = 0
42
+ else:
43
+ # Default to PyTorch style 'same'-ish symmetric padding
44
+ padding = get_padding(kernel_size, **kwargs)
45
+ return padding, dynamic
46
+
47
+ def conv2d_same(
48
+ x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1),
49
+ padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1):
50
+ x = pad_same(x, weight.shape[-2:], stride, dilation)
51
+ return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
52
+
53
+
54
+ class Conv2dSame(nn.Conv2d):
55
+ """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
56
+ """
57
+
58
+ def __init__(self, in_channels, out_channels, kernel_size, stride=1,
59
+ padding=0, dilation=1, groups=1, bias=True):
60
+ super(Conv2dSame, self).__init__(
61
+ in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
62
+
63
+ def forward(self, x):
64
+ return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
65
+
66
+
67
+ def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
68
+ padding = kwargs.pop('padding', '')
69
+ kwargs.setdefault('bias', False)
70
+ padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
71
+ if is_dynamic:
72
+ return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
73
+ else:
74
+ return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
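Note that get_padding_value above still references is_static_pad and get_padding, which are neither defined here nor imported (the original import from .padding is commented out), so only the dynamic pad_same path is self-contained in this file. The arithmetic that path implements can be checked standalone; the kernel, stride, and input size below are illustrative:

    import math

    def same_padding(x, k, s, d=1):
        # mirrors get_same_padding(): total padding needed along one dimension
        return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)

    pad = same_padding(224, k=7, s=2)
    print(pad)                        # 5 -> pad_same splits it as (2, 3) around the axis
    print((224 + pad - 7) // 2 + 1)   # 112 == ceil(224 / 2), i.e. TF 'SAME' behaviour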
TimeSformer/timesformer/models/custom_video_model_builder.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+
4
+ """A More Flexible Video models."""
TimeSformer/timesformer/models/features.py ADDED
@@ -0,0 +1,266 @@
1
+ # Copyright 2020 Ross Wightman
2
+
3
+ from collections import OrderedDict, defaultdict
4
+ from copy import deepcopy
5
+ from functools import partial
6
+ from typing import Dict, List, Tuple
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+
12
+ class FeatureInfo:
13
+
14
+ def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]):
15
+ prev_reduction = 1
16
+ for fi in feature_info:
17
+ # sanity check the mandatory fields, there may be additional fields depending on the model
18
+ assert 'num_chs' in fi and fi['num_chs'] > 0
19
+ assert 'reduction' in fi and fi['reduction'] >= prev_reduction
20
+ prev_reduction = fi['reduction']
21
+ assert 'module' in fi
22
+ self.out_indices = out_indices
23
+ self.info = feature_info
24
+
25
+ def from_other(self, out_indices: Tuple[int]):
26
+ return FeatureInfo(deepcopy(self.info), out_indices)
27
+
28
+ def get(self, key, idx=None):
29
+ """ Get value by key at specified index (indices)
30
+ if idx == None, returns value for key at each output index
31
+ if idx is an integer, return value for that feature module index (ignoring output indices)
32
+        if idx is a list/tuple, return value for each module index (ignoring output indices)
33
+ """
34
+ if idx is None:
35
+ return [self.info[i][key] for i in self.out_indices]
36
+ if isinstance(idx, (tuple, list)):
37
+ return [self.info[i][key] for i in idx]
38
+ else:
39
+ return self.info[idx][key]
40
+
41
+ def get_dicts(self, keys=None, idx=None):
42
+ """ return info dicts for specified keys (or all if None) at specified indices (or out_indices if None)
43
+ """
44
+ if idx is None:
45
+ if keys is None:
46
+ return [self.info[i] for i in self.out_indices]
47
+ else:
48
+ return [{k: self.info[i][k] for k in keys} for i in self.out_indices]
49
+ if isinstance(idx, (tuple, list)):
50
+ return [self.info[i] if keys is None else {k: self.info[i][k] for k in keys} for i in idx]
51
+ else:
52
+ return self.info[idx] if keys is None else {k: self.info[idx][k] for k in keys}
53
+
54
+ def channels(self, idx=None):
55
+ """ feature channels accessor
56
+ """
57
+ return self.get('num_chs', idx)
58
+
59
+ def reduction(self, idx=None):
60
+ """ feature reduction (output stride) accessor
61
+ """
62
+ return self.get('reduction', idx)
63
+
64
+ def module_name(self, idx=None):
65
+ """ feature module name accessor
66
+ """
67
+ return self.get('module', idx)
68
+
69
+ def __getitem__(self, item):
70
+ return self.info[item]
71
+
72
+ def __len__(self):
73
+ return len(self.info)
74
+
75
+
76
+ class FeatureHooks:
77
+ """ Feature Hook Helper
78
+ This module helps with the setup and extraction of hooks for extracting features from
79
+ internal nodes in a model by node name. This works quite well in eager Python but needs
80
+    redesign for torchscript.
81
+ """
82
+
83
+ def __init__(self, hooks, named_modules, out_map=None, default_hook_type='forward'):
84
+ # setup feature hooks
85
+ modules = {k: v for k, v in named_modules}
86
+ for i, h in enumerate(hooks):
87
+ hook_name = h['module']
88
+ m = modules[hook_name]
89
+ hook_id = out_map[i] if out_map else hook_name
90
+ hook_fn = partial(self._collect_output_hook, hook_id)
91
+ hook_type = h['hook_type'] if 'hook_type' in h else default_hook_type
92
+ if hook_type == 'forward_pre':
93
+ m.register_forward_pre_hook(hook_fn)
94
+ elif hook_type == 'forward':
95
+ m.register_forward_hook(hook_fn)
96
+ else:
97
+ assert False, "Unsupported hook type"
98
+ self._feature_outputs = defaultdict(OrderedDict)
99
+
100
+ def _collect_output_hook(self, hook_id, *args):
101
+ x = args[-1] # tensor we want is last argument, output for fwd, input for fwd_pre
102
+ if isinstance(x, tuple):
103
+ x = x[0] # unwrap input tuple
104
+ self._feature_outputs[x.device][hook_id] = x
105
+
106
+ def get_output(self, device) -> Dict[str, torch.Tensor]:
107
+ output = self._feature_outputs[device]
108
+ self._feature_outputs[device] = OrderedDict() # clear after reading
109
+ return output
110
+
111
+
112
+ def _module_list(module, flatten_sequential=False):
113
+ # a yield/iter would be better for this but wouldn't be compatible with torchscript
114
+ ml = []
115
+ for name, module in module.named_children():
116
+ if flatten_sequential and isinstance(module, nn.Sequential):
117
+ # first level of Sequential containers is flattened into containing model
118
+ for child_name, child_module in module.named_children():
119
+ combined = [name, child_name]
120
+ ml.append(('_'.join(combined), '.'.join(combined), child_module))
121
+ else:
122
+ ml.append((name, name, module))
123
+ return ml
124
+
125
+
126
+ def _get_feature_info(net, out_indices):
127
+ feature_info = getattr(net, 'feature_info')
128
+ if isinstance(feature_info, FeatureInfo):
129
+ return feature_info.from_other(out_indices)
130
+ elif isinstance(feature_info, (list, tuple)):
131
+ return FeatureInfo(net.feature_info, out_indices)
132
+ else:
133
+ assert False, "Provided feature_info is not valid"
134
+
135
+
136
+ def _get_return_layers(feature_info, out_map):
137
+ module_names = feature_info.module_name()
138
+ return_layers = {}
139
+ for i, name in enumerate(module_names):
140
+ return_layers[name] = out_map[i] if out_map is not None else feature_info.out_indices[i]
141
+ return return_layers
142
+
143
+
144
+ class FeatureDictNet(nn.ModuleDict):
145
+ """ Feature extractor with OrderedDict return
146
+ Wrap a model and extract features as specified by the out indices; the network is
147
+ partially re-built from contained modules.
148
+ There is a strong assumption that the modules have been registered into the model in the same
149
+ order as they are used. There should be no reuse of the same nn.Module more than once, including
150
+ trivial modules like `self.relu = nn.ReLU`.
151
+ Only submodules that are directly assigned to the model class (`model.feature1`) or at most
152
+ one Sequential container deep (`model.features.1`, with flatten_sequential=True) can be captured.
153
+ All Sequential containers that are directly assigned to the original model will have their
154
+ modules assigned to this module with the name `model.features.1` being changed to `model.features_1`.
155
+ Arguments:
156
+ model (nn.Module): model from which we will extract the features
157
+ out_indices (tuple[int]): model output indices to extract features for
158
+ out_map (sequence): list or tuple specifying desired return id for each out index,
159
+ otherwise str(index) is used
160
+ feature_concat (bool): whether to concatenate intermediate features that are lists or tuples
161
+ vs select element [0]
162
+ flatten_sequential (bool): whether to flatten sequential modules assigned to model
163
+ """
164
+ def __init__(
165
+ self, model,
166
+ out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False):
167
+ super(FeatureDictNet, self).__init__()
168
+ self.feature_info = _get_feature_info(model, out_indices)
169
+ self.concat = feature_concat
170
+ self.return_layers = {}
171
+ return_layers = _get_return_layers(self.feature_info, out_map)
172
+ modules = _module_list(model, flatten_sequential=flatten_sequential)
173
+ remaining = set(return_layers.keys())
174
+ layers = OrderedDict()
175
+ for new_name, old_name, module in modules:
176
+ layers[new_name] = module
177
+ if old_name in remaining:
178
+ # return id has to be consistently str type for torchscript
179
+ self.return_layers[new_name] = str(return_layers[old_name])
180
+ remaining.remove(old_name)
181
+ if not remaining:
182
+ break
183
+ assert not remaining and len(self.return_layers) == len(return_layers), \
184
+ f'Return layers ({remaining}) are not present in model'
185
+ self.update(layers)
186
+
187
+ def _collect(self, x) -> (Dict[str, torch.Tensor]):
188
+ out = OrderedDict()
189
+ for name, module in self.items():
190
+ x = module(x)
191
+ if name in self.return_layers:
192
+ out_id = self.return_layers[name]
193
+ if isinstance(x, (tuple, list)):
194
+ # If model tap is a tuple or list, concat or select first element
195
+ # FIXME this may need to be more generic / flexible for some nets
196
+ out[out_id] = torch.cat(x, 1) if self.concat else x[0]
197
+ else:
198
+ out[out_id] = x
199
+ return out
200
+
201
+ def forward(self, x) -> Dict[str, torch.Tensor]:
202
+ return self._collect(x)
203
+
204
+
205
+ class FeatureListNet(FeatureDictNet):
206
+ """ Feature extractor with list return
207
+ See docstring for FeatureDictNet above, this class exists only to appease Torchscript typing constraints.
208
+ In eager Python we could have returned List[Tensor] vs Dict[id, Tensor] based on a member bool.
209
+ """
210
+ def __init__(
211
+ self, model,
212
+ out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False):
213
+ super(FeatureListNet, self).__init__(
214
+ model, out_indices=out_indices, out_map=out_map, feature_concat=feature_concat,
215
+ flatten_sequential=flatten_sequential)
216
+
217
+ def forward(self, x) -> (List[torch.Tensor]):
218
+ return list(self._collect(x).values())
219
+
220
+
221
+ class FeatureHookNet(nn.ModuleDict):
222
+ """ FeatureHookNet
223
+ Wrap a model and extract features specified by the out indices using forward/forward-pre hooks.
224
+ If `no_rewrite` is True, features are extracted via hooks without modifying the underlying
225
+ network in any way.
226
+ If `no_rewrite` is False, the model will be re-written as in the
227
+ FeatureList/FeatureDict case by folding first to second (Sequential only) level modules into this one.
228
+ FIXME this does not currently work with Torchscript, see FeatureHooks class
229
+ """
230
+ def __init__(
231
+ self, model,
232
+ out_indices=(0, 1, 2, 3, 4), out_map=None, out_as_dict=False, no_rewrite=False,
233
+ feature_concat=False, flatten_sequential=False, default_hook_type='forward'):
234
+ super(FeatureHookNet, self).__init__()
235
+ assert not torch.jit.is_scripting()
236
+ self.feature_info = _get_feature_info(model, out_indices)
237
+ self.out_as_dict = out_as_dict
238
+ layers = OrderedDict()
239
+ hooks = []
240
+ if no_rewrite:
241
+ assert not flatten_sequential
242
+ if hasattr(model, 'reset_classifier'): # make sure classifier is removed?
243
+ model.reset_classifier(0)
244
+ layers['body'] = model
245
+ hooks.extend(self.feature_info.get_dicts())
246
+ else:
247
+ modules = _module_list(model, flatten_sequential=flatten_sequential)
248
+ remaining = {f['module']: f['hook_type'] if 'hook_type' in f else default_hook_type
249
+ for f in self.feature_info.get_dicts()}
250
+ for new_name, old_name, module in modules:
251
+ layers[new_name] = module
252
+ for fn, fm in module.named_modules(prefix=old_name):
253
+ if fn in remaining:
254
+ hooks.append(dict(module=fn, hook_type=remaining[fn]))
255
+ del remaining[fn]
256
+ if not remaining:
257
+ break
258
+ assert not remaining, f'Return layers ({remaining}) are not present in model'
259
+ self.update(layers)
260
+ self.hooks = FeatureHooks(hooks, model.named_modules(), out_map=out_map)
261
+
262
+ def forward(self, x):
263
+ for name, module in self.items():
264
+ x = module(x)
265
+ out = self.hooks.get_output(x.device)
266
+ return out if self.out_as_dict else list(out.values())
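A minimal sketch (not part of the diff) of how FeatureListNet above can wrap a backbone. ToyBackbone and its layer names are hypothetical and exist only for illustration; the only contract is that the model exposes a feature_info list whose entries carry num_chs, reduction and module, in registration order.

import torch
import torch.nn as nn

class ToyBackbone(nn.Module):
    # Hypothetical backbone for illustration only.
    def __init__(self):
        super().__init__()
        self.stem = nn.Conv2d(3, 16, 3, stride=2, padding=1)
        self.layer1 = nn.Conv2d(16, 32, 3, stride=2, padding=1)
        self.layer2 = nn.Conv2d(32, 64, 3, stride=2, padding=1)
        # Mandatory fields checked by FeatureInfo: num_chs, reduction, module.
        self.feature_info = [
            dict(num_chs=16, reduction=2, module='stem'),
            dict(num_chs=32, reduction=4, module='layer1'),
            dict(num_chs=64, reduction=8, module='layer2'),
        ]

    def forward(self, x):
        return self.layer2(self.layer1(self.stem(x)))

features = FeatureListNet(ToyBackbone(), out_indices=(0, 1, 2))
outs = features(torch.randn(1, 3, 64, 64))
print([tuple(o.shape) for o in outs])  # one feature map per requested out index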
TimeSformer/timesformer/models/head_helper.py ADDED
@@ -0,0 +1,235 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+
3
+ """ResNe(X)t Head helper."""
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ class ResNetBasicHead(nn.Module):
9
+ """
10
+ ResNe(X)t 3D head.
11
+ This layer performs a fully-connected projection during training, when the
12
+ input size is 1x1x1. It performs a convolutional projection during testing
13
+ when the input size is larger than 1x1x1. If the inputs are from multiple
14
+ different pathways, the inputs will be concatenated after pooling.
15
+ """
16
+
17
+ def __init__(
18
+ self,
19
+ dim_in,
20
+ num_classes,
21
+ pool_size,
22
+ dropout_rate=0.0,
23
+ act_func="softmax",
24
+ ):
25
+ """
26
+ The `__init__` method of any subclass should also contain these
27
+ arguments.
28
+ ResNetBasicHead takes p pathways as input where p in [1, infty].
29
+
30
+ Args:
31
+ dim_in (list): the list of channel dimensions of the p inputs to the
32
+ ResNetHead.
33
+ num_classes (int): the channel dimensions of the p outputs to the
34
+ ResNetHead.
35
+ pool_size (list): the list of kernel sizes of p spatial temporal
36
+ poolings, temporal pool kernel size, spatial pool kernel size,
37
+ spatial pool kernel size in order.
38
+ dropout_rate (float): dropout rate. If equal to 0.0, perform no
39
+ dropout.
40
+ act_func (string): activation function to use. 'softmax': applies
41
+ softmax on the output. 'sigmoid': applies sigmoid on the output.
42
+ """
43
+ super(ResNetBasicHead, self).__init__()
44
+ assert (
45
+ len({len(pool_size), len(dim_in)}) == 1
46
+ ), "pathway dimensions are not consistent."
47
+ self.num_pathways = len(pool_size)
48
+
49
+ for pathway in range(self.num_pathways):
50
+ if pool_size[pathway] is None:
51
+ avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
52
+ else:
53
+ avg_pool = nn.AvgPool3d(pool_size[pathway], stride=1)
54
+ self.add_module("pathway{}_avgpool".format(pathway), avg_pool)
55
+
56
+ if dropout_rate > 0.0:
57
+ self.dropout = nn.Dropout(dropout_rate)
58
+ # Perform FC in a fully convolutional manner. The FC layer will be
59
+ # initialized with a different std compared to convolutional layers.
60
+ self.projection = nn.Linear(sum(dim_in), num_classes, bias=True)
61
+
62
+ # Softmax for evaluation and testing.
63
+ if act_func == "softmax":
64
+ self.act = nn.Softmax(dim=4)
65
+ elif act_func == "sigmoid":
66
+ self.act = nn.Sigmoid()
67
+ else:
68
+ raise NotImplementedError(
69
+ "{} is not supported as an activation"
70
+ "function.".format(act_func)
71
+ )
72
+
73
+ def forward(self, inputs):
74
+ assert (
75
+ len(inputs) == self.num_pathways
76
+ ), "Input tensor does not contain {} pathway".format(self.num_pathways)
77
+ pool_out = []
78
+ for pathway in range(self.num_pathways):
79
+ m = getattr(self, "pathway{}_avgpool".format(pathway))
80
+ pool_out.append(m(inputs[pathway]))
81
+ x = torch.cat(pool_out, 1)
82
+ # (N, C, T, H, W) -> (N, T, H, W, C).
83
+ x = x.permute((0, 2, 3, 4, 1))
84
+ # Perform dropout.
85
+ if hasattr(self, "dropout"):
86
+ x = self.dropout(x)
87
+ x = self.projection(x)
88
+
89
+ # Performs fully convolutional inference.
90
+ if not self.training:
91
+ x = self.act(x)
92
+ x = x.mean([1, 2, 3])
93
+
94
+ x = x.view(x.shape[0], -1)
95
+ return x
96
+
97
+
98
+ class X3DHead(nn.Module):
99
+ """
100
+ X3D head.
101
+ This layer performs a fully-connected projection during training, when the
102
+ input size is 1x1x1. It performs a convolutional projection during testing
103
+ when the input size is larger than 1x1x1. If the inputs are from multiple
104
+ different pathways, the inputs will be concatenated after pooling.
105
+ """
106
+
107
+ def __init__(
108
+ self,
109
+ dim_in,
110
+ dim_inner,
111
+ dim_out,
112
+ num_classes,
113
+ pool_size,
114
+ dropout_rate=0.0,
115
+ act_func="softmax",
116
+ inplace_relu=True,
117
+ eps=1e-5,
118
+ bn_mmt=0.1,
119
+ norm_module=nn.BatchNorm3d,
120
+ bn_lin5_on=False,
121
+ ):
122
+ """
123
+ The `__init__` method of any subclass should also contain these
124
+ arguments.
125
+ X3DHead takes a 5-dim feature tensor (BxCxTxHxW) as input.
126
+
127
+ Args:
128
+ dim_in (float): the channel dimension C of the input.
129
+ num_classes (int): the channel dimensions of the output.
130
+ pool_size (float): a single entry list of kernel size for
131
+ spatiotemporal pooling for the TxHxW dimensions.
132
+ dropout_rate (float): dropout rate. If equal to 0.0, perform no
133
+ dropout.
134
+ act_func (string): activation function to use. 'softmax': applies
135
+ softmax on the output. 'sigmoid': applies sigmoid on the output.
136
+ inplace_relu (bool): if True, calculate the relu on the original
137
+ input without allocating new memory.
138
+ eps (float): epsilon for batch norm.
139
+ bn_mmt (float): momentum for batch norm. Note that BN momentum in
140
+ PyTorch = 1 - BN momentum in Caffe2.
141
+ norm_module (nn.Module): nn.Module for the normalization layer. The
142
+ default is nn.BatchNorm3d.
143
+ bn_lin5_on (bool): if True, perform normalization on the features
144
+ before the classifier.
145
+ """
146
+ super(X3DHead, self).__init__()
147
+ self.pool_size = pool_size
148
+ self.dropout_rate = dropout_rate
149
+ self.num_classes = num_classes
150
+ self.act_func = act_func
151
+ self.eps = eps
152
+ self.bn_mmt = bn_mmt
153
+ self.inplace_relu = inplace_relu
154
+ self.bn_lin5_on = bn_lin5_on
155
+ self._construct_head(dim_in, dim_inner, dim_out, norm_module)
156
+
157
+ def _construct_head(self, dim_in, dim_inner, dim_out, norm_module):
158
+
159
+ self.conv_5 = nn.Conv3d(
160
+ dim_in,
161
+ dim_inner,
162
+ kernel_size=(1, 1, 1),
163
+ stride=(1, 1, 1),
164
+ padding=(0, 0, 0),
165
+ bias=False,
166
+ )
167
+ self.conv_5_bn = norm_module(
168
+ num_features=dim_inner, eps=self.eps, momentum=self.bn_mmt
169
+ )
170
+ self.conv_5_relu = nn.ReLU(self.inplace_relu)
171
+
172
+ if self.pool_size is None:
173
+ self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
174
+ else:
175
+ self.avg_pool = nn.AvgPool3d(self.pool_size, stride=1)
176
+
177
+ self.lin_5 = nn.Conv3d(
178
+ dim_inner,
179
+ dim_out,
180
+ kernel_size=(1, 1, 1),
181
+ stride=(1, 1, 1),
182
+ padding=(0, 0, 0),
183
+ bias=False,
184
+ )
185
+ if self.bn_lin5_on:
186
+ self.lin_5_bn = norm_module(
187
+ num_features=dim_out, eps=self.eps, momentum=self.bn_mmt
188
+ )
189
+ self.lin_5_relu = nn.ReLU(self.inplace_relu)
190
+
191
+ if self.dropout_rate > 0.0:
192
+ self.dropout = nn.Dropout(self.dropout_rate)
193
+ # Perform FC in a fully convolutional manner. The FC layer will be
194
+ # initialized with a different std compared to convolutional layers.
195
+ self.projection = nn.Linear(dim_out, self.num_classes, bias=True)
196
+
197
+ # Softmax for evaluation and testing.
198
+ if self.act_func == "softmax":
199
+ self.act = nn.Softmax(dim=4)
200
+ elif self.act_func == "sigmoid":
201
+ self.act = nn.Sigmoid()
202
+ else:
203
+ raise NotImplementedError(
204
+ "{} is not supported as an activation"
205
+ "function.".format(self.act_func)
206
+ )
207
+
208
+ def forward(self, inputs):
209
+ # In its current design the X3D head is only usable for a single
210
+ # pathway input.
211
+ assert len(inputs) == 1, "Input tensor does not contain 1 pathway"
212
+ x = self.conv_5(inputs[0])
213
+ x = self.conv_5_bn(x)
214
+ x = self.conv_5_relu(x)
215
+ x = self.avg_pool(x)
216
+
217
+ x = self.lin_5(x)
218
+ if self.bn_lin5_on:
219
+ x = self.lin_5_bn(x)
220
+ x = self.lin_5_relu(x)
221
+
222
+ # (N, C, T, H, W) -> (N, T, H, W, C).
223
+ x = x.permute((0, 2, 3, 4, 1))
224
+ # Perform dropout.
225
+ if hasattr(self, "dropout"):
226
+ x = self.dropout(x)
227
+ x = self.projection(x)
228
+
229
+ # Performs fully convolutional inference.
230
+ if not self.training:
231
+ x = self.act(x)
232
+ x = x.mean([1, 2, 3])
233
+
234
+ x = x.view(x.shape[0], -1)
235
+ return x
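A minimal sketch (not part of the diff) of driving ResNetBasicHead on a single pathway; the shapes, pool size and class count below are illustrative assumptions, not values taken from the configs.

import torch

head = ResNetBasicHead(
    dim_in=[256],            # channel dims of the single input pathway (assumed)
    num_classes=400,
    pool_size=[[4, 7, 7]],   # temporal, spatial, spatial kernel of the avg pool (assumed)
    dropout_rate=0.5,
    act_func="softmax",
)
head.eval()                  # softmax + spatiotemporal mean only run outside training
clip_feats = torch.randn(2, 256, 4, 7, 7)   # (N, C, T, H, W)
print(head([clip_feats]).shape)             # torch.Size([2, 400])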
TimeSformer/timesformer/models/helpers.py ADDED
@@ -0,0 +1,360 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ # Copyright 2020 Ross Wightman
3
+ # Modified model creation / weight loading / state_dict helpers
4
+
5
+ import logging
6
+ import os
7
+ import math
8
+ from collections import OrderedDict
9
+ from copy import deepcopy
10
+ from typing import Callable
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torch.utils.model_zoo as model_zoo
15
+ import torch.nn.functional as F
16
+
17
+ from timesformer.models.features import FeatureListNet, FeatureDictNet, FeatureHookNet
18
+ from timesformer.models.conv2d_same import Conv2dSame
19
+ from timesformer.models.linear import Linear
20
+
21
+
22
+ _logger = logging.getLogger(__name__)
23
+
24
+ def load_state_dict(checkpoint_path, use_ema=False):
25
+ if checkpoint_path and os.path.isfile(checkpoint_path):
26
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
27
+ state_dict_key = 'state_dict'
28
+ if isinstance(checkpoint, dict):
29
+ if use_ema and 'state_dict_ema' in checkpoint:
30
+ state_dict_key = 'state_dict_ema'
31
+ if state_dict_key and state_dict_key in checkpoint:
32
+ new_state_dict = OrderedDict()
33
+ for k, v in checkpoint[state_dict_key].items():
34
+ # strip `module.` prefix
35
+ name = k[7:] if k.startswith('module') else k
36
+ new_state_dict[name] = v
37
+ state_dict = new_state_dict
38
+ elif 'model_state' in checkpoint:
39
+ state_dict_key = 'model_state'
40
+ new_state_dict = OrderedDict()
41
+ for k, v in checkpoint[state_dict_key].items():
42
+ # strip `model.` prefix
43
+ name = k[6:] if k.startswith('model') else k
44
+ new_state_dict[name] = v
45
+ state_dict = new_state_dict
46
+ else:
47
+ state_dict = checkpoint
48
+ _logger.info("Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path))
49
+ return state_dict
50
+ else:
51
+ _logger.error("No checkpoint found at '{}'".format(checkpoint_path))
52
+ raise FileNotFoundError()
53
+
54
+
55
+ def load_checkpoint(model, checkpoint_path, use_ema=False, strict=True):
56
+ state_dict = load_state_dict(checkpoint_path, use_ema)
57
+ model.load_state_dict(state_dict, strict=strict)
58
+
59
+
60
+ def resume_checkpoint(model, checkpoint_path, optimizer=None, loss_scaler=None, log_info=True):
61
+ resume_epoch = None
62
+ if os.path.isfile(checkpoint_path):
63
+ checkpoint = torch.load(checkpoint_path, map_location='cpu')
64
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
65
+ if log_info:
66
+ _logger.info('Restoring model state from checkpoint...')
67
+ new_state_dict = OrderedDict()
68
+ for k, v in checkpoint['state_dict'].items():
69
+ name = k[7:] if k.startswith('module') else k
70
+ new_state_dict[name] = v
71
+ model.load_state_dict(new_state_dict)
72
+
73
+ if optimizer is not None and 'optimizer' in checkpoint:
74
+ if log_info:
75
+ _logger.info('Restoring optimizer state from checkpoint...')
76
+ optimizer.load_state_dict(checkpoint['optimizer'])
77
+
78
+ if loss_scaler is not None and loss_scaler.state_dict_key in checkpoint:
79
+ if log_info:
80
+ _logger.info('Restoring AMP loss scaler state from checkpoint...')
81
+ loss_scaler.load_state_dict(checkpoint[loss_scaler.state_dict_key])
82
+
83
+ if 'epoch' in checkpoint:
84
+ resume_epoch = checkpoint['epoch']
85
+ if 'version' in checkpoint and checkpoint['version'] > 1:
86
+ resume_epoch += 1 # start at the next epoch, old checkpoints incremented before save
87
+
88
+ if log_info:
89
+ _logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
90
+ else:
91
+ model.load_state_dict(checkpoint)
92
+ if log_info:
93
+ _logger.info("Loaded checkpoint '{}'".format(checkpoint_path))
94
+ return resume_epoch
95
+ else:
96
+ _logger.error("No checkpoint found at '{}'".format(checkpoint_path))
97
+ raise FileNotFoundError()
98
+
99
+
100
+ def load_pretrained(model, cfg=None, num_classes=1000, in_chans=3, filter_fn=None, img_size=224, num_frames=8, num_patches=196, attention_type='divided_space_time', pretrained_model="", strict=True):
101
+ if cfg is None:
102
+ cfg = getattr(model, 'default_cfg')
103
+ if cfg is None or 'url' not in cfg or not cfg['url']:
104
+ _logger.warning("Pretrained model URL is invalid, using random initialization.")
105
+ return
106
+
107
+ if len(pretrained_model) == 0:
108
+ state_dict = model_zoo.load_url(cfg['url'], progress=False, map_location='cpu')
109
+ else:
110
+ try:
111
+ state_dict = load_state_dict(pretrained_model)['model']
112
+ except:
113
+ state_dict = load_state_dict(pretrained_model)
114
+
115
+
116
+ if filter_fn is not None:
117
+ state_dict = filter_fn(state_dict)
118
+
119
+ if in_chans == 1:
120
+ conv1_name = cfg['first_conv']
121
+ _logger.info('Converting first conv (%s) pretrained weights from 3 to 1 channel' % conv1_name)
122
+ conv1_weight = state_dict[conv1_name + '.weight']
123
+ conv1_type = conv1_weight.dtype
124
+ conv1_weight = conv1_weight.float()
125
+ O, I, J, K = conv1_weight.shape
126
+ if I > 3:
127
+ assert conv1_weight.shape[1] % 3 == 0
128
+ # For models with space2depth stems
129
+ conv1_weight = conv1_weight.reshape(O, I // 3, 3, J, K)
130
+ conv1_weight = conv1_weight.sum(dim=2, keepdim=False)
131
+ else:
132
+ conv1_weight = conv1_weight.sum(dim=1, keepdim=True)
133
+ conv1_weight = conv1_weight.to(conv1_type)
134
+ state_dict[conv1_name + '.weight'] = conv1_weight
135
+ elif in_chans != 3:
136
+ conv1_name = cfg['first_conv']
137
+ conv1_weight = state_dict[conv1_name + '.weight']
138
+ conv1_type = conv1_weight.dtype
139
+ conv1_weight = conv1_weight.float()
140
+ O, I, J, K = conv1_weight.shape
141
+ if I != 3:
142
+ _logger.warning('Deleting first conv (%s) from pretrained weights.' % conv1_name)
143
+ del state_dict[conv1_name + '.weight']
144
+ strict = False
145
+ else:
146
+ _logger.info('Repeating first conv (%s) weights in channel dim.' % conv1_name)
147
+ repeat = int(math.ceil(in_chans / 3))
148
+ conv1_weight = conv1_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :]
149
+ conv1_weight *= (3 / float(in_chans))
150
+ conv1_weight = conv1_weight.to(conv1_type)
151
+ state_dict[conv1_name + '.weight'] = conv1_weight
152
+
153
+
154
+ classifier_name = cfg['classifier']
155
+ if num_classes == 1000 and cfg['num_classes'] == 1001:
156
+ # special case for imagenet trained models with extra background class in pretrained weights
157
+ classifier_weight = state_dict[classifier_name + '.weight']
158
+ state_dict[classifier_name + '.weight'] = classifier_weight[1:]
159
+ classifier_bias = state_dict[classifier_name + '.bias']
160
+ state_dict[classifier_name + '.bias'] = classifier_bias[1:]
161
+ elif num_classes != state_dict[classifier_name + '.weight'].size(0):
162
+ #print('Removing the last fully connected layer due to dimensions mismatch ('+str(num_classes)+ ' != '+str(state_dict[classifier_name + '.weight'].size(0))+').', flush=True)
163
+ # completely discard fully connected for all other differences between pretrained and created model
164
+ del state_dict[classifier_name + '.weight']
165
+ del state_dict[classifier_name + '.bias']
166
+ strict = False
167
+
168
+
169
+ ## Resizing the positional embeddings in case they don't match
170
+ if num_patches + 1 != state_dict['pos_embed'].size(1):
171
+ pos_embed = state_dict['pos_embed']
172
+ cls_pos_embed = pos_embed[0,0,:].unsqueeze(0).unsqueeze(1)
173
+ other_pos_embed = pos_embed[0,1:,:].unsqueeze(0).transpose(1, 2)
174
+ new_pos_embed = F.interpolate(other_pos_embed, size=(num_patches), mode='nearest')
175
+ new_pos_embed = new_pos_embed.transpose(1, 2)
176
+ new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed), 1)
177
+ state_dict['pos_embed'] = new_pos_embed
178
+
179
+ ## Resizing time embeddings in case they don't match
180
+ if 'time_embed' in state_dict and num_frames != state_dict['time_embed'].size(1):
181
+ time_embed = state_dict['time_embed'].transpose(1, 2)
182
+ new_time_embed = F.interpolate(time_embed, size=(num_frames), mode='nearest')
183
+ state_dict['time_embed'] = new_time_embed.transpose(1, 2)
184
+
185
+ ## Initializing temporal attention
186
+ if attention_type == 'divided_space_time':
187
+ new_state_dict = state_dict.copy()
188
+ for key in state_dict:
189
+ if 'blocks' in key and 'attn' in key:
190
+ new_key = key.replace('attn','temporal_attn')
191
+ if not new_key in state_dict:
192
+ new_state_dict[new_key] = state_dict[key]
193
+ else:
194
+ new_state_dict[new_key] = state_dict[new_key]
195
+ if 'blocks' in key and 'norm1' in key:
196
+ new_key = key.replace('norm1','temporal_norm1')
197
+ if not new_key in state_dict:
198
+ new_state_dict[new_key] = state_dict[key]
199
+ else:
200
+ new_state_dict[new_key] = state_dict[new_key]
201
+ state_dict = new_state_dict
202
+
203
+ ## Loading the weights
204
+ model.load_state_dict(state_dict, strict=False)
205
+
206
+
207
+ def extract_layer(model, layer):
208
+ layer = layer.split('.')
209
+ module = model
210
+ if hasattr(model, 'module') and layer[0] != 'module':
211
+ module = model.module
212
+ if not hasattr(model, 'module') and layer[0] == 'module':
213
+ layer = layer[1:]
214
+ for l in layer:
215
+ if hasattr(module, l):
216
+ if not l.isdigit():
217
+ module = getattr(module, l)
218
+ else:
219
+ module = module[int(l)]
220
+ else:
221
+ return module
222
+ return module
223
+
224
+
225
+ def set_layer(model, layer, val):
226
+ layer = layer.split('.')
227
+ module = model
228
+ if hasattr(model, 'module') and layer[0] != 'module':
229
+ module = model.module
230
+ lst_index = 0
231
+ module2 = module
232
+ for l in layer:
233
+ if hasattr(module2, l):
234
+ if not l.isdigit():
235
+ module2 = getattr(module2, l)
236
+ else:
237
+ module2 = module2[int(l)]
238
+ lst_index += 1
239
+ lst_index -= 1
240
+ for l in layer[:lst_index]:
241
+ if not l.isdigit():
242
+ module = getattr(module, l)
243
+ else:
244
+ module = module[int(l)]
245
+ l = layer[lst_index]
246
+ setattr(module, l, val)
247
+
248
+
249
+ def adapt_model_from_string(parent_module, model_string):
250
+ separator = '***'
251
+ state_dict = {}
252
+ lst_shape = model_string.split(separator)
253
+ for k in lst_shape:
254
+ k = k.split(':')
255
+ key = k[0]
256
+ shape = k[1][1:-1].split(',')
257
+ if shape[0] != '':
258
+ state_dict[key] = [int(i) for i in shape]
259
+
260
+ new_module = deepcopy(parent_module)
261
+ for n, m in parent_module.named_modules():
262
+ old_module = extract_layer(parent_module, n)
263
+ if isinstance(old_module, nn.Conv2d) or isinstance(old_module, Conv2dSame):
264
+ if isinstance(old_module, Conv2dSame):
265
+ conv = Conv2dSame
266
+ else:
267
+ conv = nn.Conv2d
268
+ s = state_dict[n + '.weight']
269
+ in_channels = s[1]
270
+ out_channels = s[0]
271
+ g = 1
272
+ if old_module.groups > 1:
273
+ in_channels = out_channels
274
+ g = in_channels
275
+ new_conv = conv(
276
+ in_channels=in_channels, out_channels=out_channels, kernel_size=old_module.kernel_size,
277
+ bias=old_module.bias is not None, padding=old_module.padding, dilation=old_module.dilation,
278
+ groups=g, stride=old_module.stride)
279
+ set_layer(new_module, n, new_conv)
280
+ if isinstance(old_module, nn.BatchNorm2d):
281
+ new_bn = nn.BatchNorm2d(
282
+ num_features=state_dict[n + '.weight'][0], eps=old_module.eps, momentum=old_module.momentum,
283
+ affine=old_module.affine, track_running_stats=True)
284
+ set_layer(new_module, n, new_bn)
285
+ if isinstance(old_module, nn.Linear):
286
+ num_features = state_dict[n + '.weight'][1]
287
+ new_fc = Linear(
288
+ in_features=num_features, out_features=old_module.out_features, bias=old_module.bias is not None)
289
+ set_layer(new_module, n, new_fc)
290
+ if hasattr(new_module, 'num_features'):
291
+ new_module.num_features = num_features
292
+ new_module.eval()
293
+ parent_module.eval()
294
+
295
+ return new_module
296
+
297
+
298
+ def adapt_model_from_file(parent_module, model_variant):
299
+ adapt_file = os.path.join(os.path.dirname(__file__), 'pruned', model_variant + '.txt')
300
+ with open(adapt_file, 'r') as f:
301
+ return adapt_model_from_string(parent_module, f.read().strip())
302
+
303
+
304
+ def default_cfg_for_features(default_cfg):
305
+ default_cfg = deepcopy(default_cfg)
306
+ # remove default pretrained cfg fields that don't have much relevance for feature backbone
307
+ to_remove = ('num_classes', 'crop_pct', 'classifier') # add default final pool size?
308
+ for tr in to_remove:
309
+ default_cfg.pop(tr, None)
310
+ return default_cfg
311
+
312
+
313
+ def build_model_with_cfg(
314
+ model_cls: Callable,
315
+ variant: str,
316
+ pretrained: bool,
317
+ default_cfg: dict,
318
+ model_cfg: dict = None,
319
+ feature_cfg: dict = None,
320
+ pretrained_strict: bool = True,
321
+ pretrained_filter_fn: Callable = None,
322
+ **kwargs):
323
+ pruned = kwargs.pop('pruned', False)
324
+ features = False
325
+ feature_cfg = feature_cfg or {}
326
+
327
+ if kwargs.pop('features_only', False):
328
+ features = True
329
+ feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4))
330
+ if 'out_indices' in kwargs:
331
+ feature_cfg['out_indices'] = kwargs.pop('out_indices')
332
+
333
+ model = model_cls(**kwargs) if model_cfg is None else model_cls(cfg=model_cfg, **kwargs)
334
+ model.default_cfg = deepcopy(default_cfg)
335
+
336
+ if pruned:
337
+ model = adapt_model_from_file(model, variant)
338
+
339
+ # for classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats
340
+ num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000))
341
+ if pretrained:
342
+ load_pretrained(
343
+ model,
344
+ num_classes=num_classes_pretrained, in_chans=kwargs.get('in_chans', 3),
345
+ filter_fn=pretrained_filter_fn, strict=pretrained_strict)
346
+
347
+ if features:
348
+ feature_cls = FeatureListNet
349
+ if 'feature_cls' in feature_cfg:
350
+ feature_cls = feature_cfg.pop('feature_cls')
351
+ if isinstance(feature_cls, str):
352
+ feature_cls = feature_cls.lower()
353
+ if 'hook' in feature_cls:
354
+ feature_cls = FeatureHookNet
355
+ else:
356
+ assert False, f'Unknown feature class {feature_cls}'
357
+ model = feature_cls(model, **feature_cfg)
358
+ model.default_cfg = default_cfg_for_features(default_cfg) # add back default_cfg
359
+
360
+ return model
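A minimal, self-contained sketch (not part of the diff) of the positional-embedding resizing that load_pretrained performs when the target model's patch count differs from the checkpoint's; the sizes below are illustrative assumptions.

import torch
import torch.nn.functional as F

pos_embed = torch.randn(1, 197, 768)   # 1 cls token + 196 patches (e.g. 224px, 16x16 patches)
num_patches = 576                      # assumed target, e.g. 384px crops with 16x16 patches

cls_pos_embed = pos_embed[0, 0, :].unsqueeze(0).unsqueeze(1)        # (1, 1, 768)
other_pos_embed = pos_embed[0, 1:, :].unsqueeze(0).transpose(1, 2)  # (1, 768, 196)
new_pos_embed = F.interpolate(other_pos_embed, size=num_patches, mode='nearest')
new_pos_embed = torch.cat((cls_pos_embed, new_pos_embed.transpose(1, 2)), 1)
print(new_pos_embed.shape)             # torch.Size([1, 577, 768])

The time_embed branch in load_pretrained follows the same pattern, interpolating along the frame dimension instead of the patch dimension.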