kadirnar committed
Commit 1fb533f
1 Parent(s): 003dd8b
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. apex/.github/ISSUE_TEMPLATE/bug_report.md +0 -23
  2. apex/.gitignore +0 -147
  3. apex/.gitmodules +0 -7
  4. apex/.nojekyll +0 -0
  5. apex/LICENSE +0 -11
  6. apex/README.md +0 -187
  7. apex/apex/RNN/README.md +0 -3
  8. apex/apex/RNN/RNNBackend.py +0 -365
  9. apex/apex/RNN/__init__.py +0 -3
  10. apex/apex/RNN/cells.py +0 -84
  11. apex/apex/RNN/models.py +0 -56
  12. apex/apex/__init__.py +0 -68
  13. apex/apex/_autocast_utils.py +0 -26
  14. apex/apex/amp/README.md +0 -72
  15. apex/apex/amp/__init__.py +0 -5
  16. apex/apex/amp/__version__.py +0 -2
  17. apex/apex/amp/_amp_state.py +0 -59
  18. apex/apex/amp/_initialize.py +0 -265
  19. apex/apex/amp/_process_optimizer.py +0 -489
  20. apex/apex/amp/amp.py +0 -183
  21. apex/apex/amp/compat.py +0 -46
  22. apex/apex/amp/frontend.py +0 -446
  23. apex/apex/amp/handle.py +0 -281
  24. apex/apex/amp/lists/__init__.py +0 -0
  25. apex/apex/amp/lists/functional_overrides.py +0 -80
  26. apex/apex/amp/lists/tensor_overrides.py +0 -63
  27. apex/apex/amp/lists/torch_overrides.py +0 -115
  28. apex/apex/amp/opt.py +0 -103
  29. apex/apex/amp/rnn_compat.py +0 -53
  30. apex/apex/amp/scaler.py +0 -217
  31. apex/apex/amp/utils.py +0 -210
  32. apex/apex/amp/wrap.py +0 -276
  33. apex/apex/contrib/__init__.py +0 -0
  34. apex/apex/contrib/bottleneck/__init__.py +0 -2
  35. apex/apex/contrib/bottleneck/bottleneck.py +0 -749
  36. apex/apex/contrib/bottleneck/halo_exchangers.py +0 -180
  37. apex/apex/contrib/bottleneck/test.py +0 -71
  38. apex/apex/contrib/clip_grad/__init__.py +0 -1
  39. apex/apex/contrib/clip_grad/clip_grad.py +0 -128
  40. apex/apex/contrib/conv_bias_relu/__init__.py +0 -2
  41. apex/apex/contrib/conv_bias_relu/conv_bias_relu.py +0 -104
  42. apex/apex/contrib/csrc/bottleneck/bottleneck.cpp +0 -0
  43. apex/apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp +0 -2153
  44. apex/apex/contrib/csrc/cudnn_gbn/cudnn_gbn.cpp +0 -163
  45. apex/apex/contrib/csrc/cudnn_gbn/norm_sample.cpp +0 -479
  46. apex/apex/contrib/csrc/cudnn_gbn/norm_sample.h +0 -153
  47. apex/apex/contrib/csrc/fmha/fmha_api.cpp +0 -365
  48. apex/apex/contrib/csrc/fmha/src/fmha.h +0 -163
  49. apex/apex/contrib/csrc/fmha/src/fmha/gemm.h +0 -314
  50. apex/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h +0 -456
apex/.github/ISSUE_TEMPLATE/bug_report.md DELETED
@@ -1,23 +0,0 @@
- ---
- name: Bug report
- about: Create a report to help us improve apex
- title: ''
- labels: bug
- assignees: ''
-
- ---
-
- **Describe the Bug**
-
- **Minimal Steps/Code to Reproduce the Bug**
- <!--
- Please list the *minimal* steps or provide a code snippet for us to be able to reproduce the bug.
-
- A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
- -->
-
- **Expected Behavior**
- <!-- A clear and concise description of what you expected to happen. -->
-
- **Environment**
- <!-- OS, version of Python, CUDA, PyTorch; collect these via `python -m torch.utils.collect_env` -->
apex/.gitignore DELETED
@@ -1,147 +0,0 @@
- apex.egg-info
- dist
- build
- docs/build
- *~
- __pycache__
- .vscode
-
- # Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
- # Byte-compiled / optimized / DLL files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # C extensions
- *.so
-
- # Distribution / packaging
- .Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- share/python-wheels/
- *.egg-info/
- .installed.cfg
- *.egg
- MANIFEST
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py,cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- #Pipfile.lock
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # SageMath parsed files
- *.sage.py
-
- # Environments
- .env
- .venv
- env/
- venv/
- ENV/
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json
-
- # Pyre type checker
- .pyre/
-
- # pytype static type analyzer
- .pytype/
-
- # Cython debug symbols
- cython_debug/
apex/.gitmodules DELETED
@@ -1,7 +0,0 @@
- [submodule "apex/contrib/csrc/multihead_attn/cutlass"]
- 	path = apex/contrib/csrc/multihead_attn/cutlass
- 	url = https://github.com/NVIDIA/cutlass.git
- 	branch = v1.2.0
- [submodule "apex/contrib/csrc/cudnn-frontend"]
- 	path = apex/contrib/csrc/cudnn-frontend
- 	url = https://github.com/NVIDIA/cudnn-frontend.git
apex/.nojekyll DELETED
File without changes
apex/LICENSE DELETED
@@ -1,11 +0,0 @@
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
-
- 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
apex/README.md DELETED
@@ -1,187 +0,0 @@
- # Introduction
-
- This repository holds NVIDIA-maintained utilities to streamline mixed precision and distributed training in Pytorch.
- Some of the code here will be included in upstream Pytorch eventually.
- The intent of Apex is to make up-to-date utilities available to users as quickly as possible.
-
- ## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex)
-
- ## [GTC 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/GTC_2019) and [Pytorch DevCon 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/Pytorch_Devcon_2019) Slides
-
- # Contents
-
- ## 1. Amp: Automatic Mixed Precision
-
- **Deprecated. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)**
-
- `apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script.
- Users can easily experiment with different pure and mixed precision training modes by supplying
- different flags to `amp.initialize`.
-
- [Webinar introducing Amp](https://info.nvidia.com/webinar-mixed-precision-with-pytorch-reg-page.html)
- (The flag `cast_batchnorm` has been renamed to `keep_batchnorm_fp32`).
-
- [API Documentation](https://nvidia.github.io/apex/amp.html)
-
- [Comprehensive Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
-
- [DCGAN example coming soon...](https://github.com/NVIDIA/apex/tree/master/examples/dcgan)
-
- [Moving to the new Amp API](https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users) (for users of the deprecated "Amp" and "FP16_Optimizer" APIs)
-
- ## 2. Distributed Training
-
- **`apex.parallel.DistributedDataParallel` is deprecated. Use [`torch.nn.parallel.DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html?highlight=distributeddataparallel#torch.nn.parallel.DistributedDataParallel)**
-
- `apex.parallel.DistributedDataParallel` is a module wrapper, similar to
- `torch.nn.parallel.DistributedDataParallel`. It enables convenient multiprocess distributed training,
- optimized for NVIDIA's NCCL communication library.
-
- [API Documentation](https://nvidia.github.io/apex/parallel.html)
-
- [Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
-
- [Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)
-
- The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
- shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.
-
- ### Synchronized Batch Normalization
-
- **Deprecated. Use [`torch.nn.SyncBatchNorm`](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html)**
-
- `apex.parallel.SyncBatchNorm` extends `torch.nn.modules.batchnorm._BatchNorm` to
- support synchronized BN.
- It allreduces stats across processes during multiprocess (DistributedDataParallel) training.
- Synchronous BN has been used in cases where only a small
- local minibatch can fit on each GPU.
- Allreduced stats increase the effective batch size for the BN layer to the
- global batch size across all processes (which, technically, is the correct
- formulation).
- Synchronous BN has been observed to improve converged accuracy in some of our research models.
-
- ### Checkpointing
-
- To properly save and load your `amp` training, we introduce the `amp.state_dict()`, which contains all `loss_scalers` and their corresponding unskipped steps,
- as well as `amp.load_state_dict()` to restore these attributes.
-
- In order to get bitwise accuracy, we recommend the following workflow:
- ```python
- # Initialization
- opt_level = 'O1'
- model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
-
- # Train your model
- ...
- with amp.scale_loss(loss, optimizer) as scaled_loss:
-     scaled_loss.backward()
- ...
-
- # Save checkpoint
- checkpoint = {
-     'model': model.state_dict(),
-     'optimizer': optimizer.state_dict(),
-     'amp': amp.state_dict()
- }
- torch.save(checkpoint, 'amp_checkpoint.pt')
- ...
-
- # Restore
- model = ...
- optimizer = ...
- checkpoint = torch.load('amp_checkpoint.pt')
-
- model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
- model.load_state_dict(checkpoint['model'])
- optimizer.load_state_dict(checkpoint['optimizer'])
- amp.load_state_dict(checkpoint['amp'])
-
- # Continue training
- ...
- ```
-
- Note that we recommend restoring the model using the same `opt_level`. Also note that we recommend calling the `load_state_dict` methods after `amp.initialize`.
-
- # Installation
- Each [`apex.contrib`](./apex/contrib) module requires one or more install options other than `--cpp_ext` and `--cuda_ext`.
- Note that contrib modules do not necessarily support stable PyTorch releases.
-
- ## Containers
- NVIDIA PyTorch Containers are available on NGC: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch.
- The containers come with all the custom extensions available at the moment.
-
- See [the NGC documentation](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html) for details such as:
- - how to pull a container
- - how to run a pulled container
- - release notes
-
- ## From Source
-
- To install Apex from source, we recommend using the nightly Pytorch obtainable from https://github.com/pytorch/pytorch.
-
- The latest stable release obtainable from https://pytorch.org should also work.
-
- We recommend installing [`Ninja`](https://ninja-build.org/) to make compilation faster.
-
- ### Linux
- For performance and full functionality, we recommend installing Apex with
- CUDA and C++ extensions via
- ```bash
- git clone https://github.com/NVIDIA/apex
- cd apex
- # if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
- pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
- # otherwise
- pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./
- ```
-
- APEX also supports a Python-only build via
- ```bash
- pip install -v --disable-pip-version-check --no-build-isolation --no-cache-dir ./
- ```
- A Python-only build omits:
- - Fused kernels required to use `apex.optimizers.FusedAdam`.
- - Fused kernels required to use `apex.normalization.FusedLayerNorm` and `apex.normalization.FusedRMSNorm`.
- - Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
- - Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
- `DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
-
-
- ### [Experimental] Windows
- `pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" .` may work if you were able to build Pytorch from source
- on your system. A Python-only build via `pip install -v --no-cache-dir .` is more likely to work.
- If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
-
-
- ## Custom C++/CUDA Extensions and Install Options
-
- If a requirement of a module is not met, then it will not be built.
-
- | Module Name | Install Option | Misc |
- |---------------|------------------|--------|
- | `apex_C` | `--cpp_ext` | |
- | `amp_C` | `--cuda_ext` | |
- | `syncbn` | `--cuda_ext` | |
- | `fused_layer_norm_cuda` | `--cuda_ext` | [`apex.normalization`](./apex/normalization) |
- | `mlp_cuda` | `--cuda_ext` | |
- | `scaled_upper_triang_masked_softmax_cuda` | `--cuda_ext` | |
- | `generic_scaled_masked_softmax_cuda` | `--cuda_ext` | |
- | `scaled_masked_softmax_cuda` | `--cuda_ext` | |
- | `fused_weight_gradient_mlp_cuda` | `--cuda_ext` | Requires CUDA>=11 |
- | `permutation_search_cuda` | `--permutation_search` | [`apex.contrib.sparsity`](./apex/contrib/sparsity) |
- | `bnp` | `--bnp` | [`apex.contrib.groupbn`](./apex/contrib/groupbn) |
- | `xentropy` | `--xentropy` | [`apex.contrib.xentropy`](./apex/contrib/xentropy) |
- | `focal_loss_cuda` | `--focal_loss` | [`apex.contrib.focal_loss`](./apex/contrib/focal_loss) |
- | `fused_index_mul_2d` | `--index_mul_2d` | [`apex.contrib.index_mul_2d`](./apex/contrib/index_mul_2d) |
- | `fused_adam_cuda` | `--deprecated_fused_adam` | [`apex.contrib.optimizers`](./apex/contrib/optimizers) |
- | `fused_lamb_cuda` | `--deprecated_fused_lamb` | [`apex.contrib.optimizers`](./apex/contrib/optimizers) |
- | `fast_layer_norm` | `--fast_layer_norm` | [`apex.contrib.layer_norm`](./apex/contrib/layer_norm). different from `fused_layer_norm` |
- | `fmhalib` | `--fmha` | [`apex.contrib.fmha`](./apex/contrib/fmha) |
- | `fast_multihead_attn` | `--fast_multihead_attn` | [`apex.contrib.multihead_attn`](./apex/contrib/multihead_attn) |
- | `transducer_joint_cuda` | `--transducer` | [`apex.contrib.transducer`](./apex/contrib/transducer) |
- | `transducer_loss_cuda` | `--transducer` | [`apex.contrib.transducer`](./apex/contrib/transducer) |
- | `cudnn_gbn_lib` | `--cudnn_gbn` | Requires cuDNN>=8.5, [`apex.contrib.cudnn_gbn`](./apex/contrib/cudnn_gbn) |
- | `peer_memory_cuda` | `--peer_memory` | [`apex.contrib.peer_memory`](./apex/contrib/peer_memory) |
- | `nccl_p2p_cuda` | `--nccl_p2p` | Requires NCCL >= 2.10, [`apex.contrib.nccl_p2p`](./apex/contrib/nccl_p2p) |
- | `fast_bottleneck` | `--fast_bottleneck` | Requires `peer_memory_cuda` and `nccl_p2p_cuda`, [`apex.contrib.bottleneck`](./apex/contrib/bottleneck) |
- | `fused_conv_bias_relu` | `--fused_conv_bias_relu` | Requires cuDNN>=8.4, [`apex.contrib.conv_bias_relu`](./apex/contrib/conv_bias_relu) |
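The table above maps each compiled extension to its install option. As a quick, editor-supplied illustration (not part of the deleted README), you can probe at runtime which of those extensions were actually built by trying to import them by module name; the names below are taken from the table, and the snippet uses only the standard library:

```python
import importlib

# Extension module names taken from the install-option table above.
candidate_extensions = ["apex_C", "amp_C", "syncbn", "fused_layer_norm_cuda"]

for name in candidate_extensions:
    try:
        importlib.import_module(name)
        print(f"{name}: built and importable")
    except ImportError:
        print(f"{name}: not built (its install option was likely not passed)")
```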
apex/apex/RNN/README.md DELETED
@@ -1,3 +0,0 @@
- **This module will be removed by the end of February 2023**
-
- Under construction...
apex/apex/RNN/RNNBackend.py DELETED
@@ -1,365 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.autograd import Variable
4
-
5
- import torch.nn.functional as F
6
-
7
- import math
8
-
9
-
10
- def is_iterable(maybe_iterable):
11
- return isinstance(maybe_iterable, list) or isinstance(maybe_iterable, tuple)
12
-
13
-
14
- def flatten_list(tens_list):
15
- """
16
- flatten_list
17
- """
18
- if not is_iterable(tens_list):
19
- return tens_list
20
-
21
- return torch.cat(tens_list, dim=0).view(len(tens_list), *tens_list[0].size() )
22
-
23
-
24
- #These modules always assumes batch_first
25
- class bidirectionalRNN(nn.Module):
26
- """
27
- bidirectionalRNN
28
- """
29
- def __init__(self, inputRNN, num_layers=1, dropout = 0):
30
- super(bidirectionalRNN, self).__init__()
31
- self.dropout = dropout
32
- self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout)
33
- self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout)
34
- self.rnns = nn.ModuleList([self.fwd, self.bckwrd])
35
-
36
- #collect hidden option will return all hidden/cell states from entire RNN
37
- def forward(self, input, collect_hidden=False):
38
- """
39
- forward()
40
- """
41
- seq_len = input.size(0)
42
- bsz = input.size(1)
43
-
44
- fwd_out, fwd_hiddens = list(self.fwd(input, collect_hidden = collect_hidden))
45
- bckwrd_out, bckwrd_hiddens = list(self.bckwrd(input, reverse=True, collect_hidden = collect_hidden))
46
-
47
- output = torch.cat( [fwd_out, bckwrd_out], -1 )
48
- hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) )
49
-
50
- return output, hiddens
51
-
52
- def reset_parameters(self):
53
- """
54
- reset_parameters()
55
- """
56
- for rnn in self.rnns:
57
- rnn.reset_parameters()
58
-
59
- def init_hidden(self, bsz):
60
- """
61
- init_hidden()
62
- """
63
- for rnn in self.rnns:
64
- rnn.init_hidden(bsz)
65
-
66
- def detach_hidden(self):
67
- """
68
- detach_hidden()
69
- """
70
- for rnn in self.rnns:
71
- rnn.detachHidden()
72
-
73
- def reset_hidden(self, bsz):
74
- """
75
- reset_hidden()
76
- """
77
- for rnn in self.rnns:
78
- rnn.reset_hidden(bsz)
79
-
80
- def init_inference(self, bsz):
81
- """
82
- init_inference()
83
- """
84
- for rnn in self.rnns:
85
- rnn.init_inference(bsz)
86
-
87
-
88
- #assumes hidden_state[0] of inputRNN is output hidden state
89
- #constructor either takes an RNNCell or list of RNN layers
90
- class stackedRNN(nn.Module):
91
- """
92
- stackedRNN
93
- """
94
- def __init__(self, inputRNN, num_layers=1, dropout=0):
95
- super(stackedRNN, self).__init__()
96
-
97
- self.dropout = dropout
98
-
99
- if isinstance(inputRNN, RNNCell):
100
- self.rnns = [inputRNN]
101
- for i in range(num_layers-1):
102
- self.rnns.append(inputRNN.new_like(inputRNN.output_size))
103
- elif isinstance(inputRNN, list):
104
- assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
105
- self.rnns=inputRNN
106
- else:
107
- raise RuntimeError()
108
-
109
- self.nLayers = len(self.rnns)
110
-
111
- self.rnns = nn.ModuleList(self.rnns)
112
-
113
-
114
- '''
115
- Returns output as hidden_state[0] Tensor([sequence steps][batch size][features])
116
- If collect hidden will also return Tuple(
117
- [n_hidden_states][sequence steps] Tensor([layer][batch size][features])
118
- )
119
- If not collect hidden will also return Tuple(
120
- [n_hidden_states] Tensor([layer][batch size][features])
121
- '''
122
- def forward(self, input, collect_hidden=False, reverse=False):
123
- """
124
- forward()
125
- """
126
- seq_len = input.size(0)
127
- bsz = input.size(1)
128
- inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)
129
-
130
- hidden_states = [[] for i in range(self.nLayers)]
131
- outputs = []
132
-
133
- for seq in inp_iter:
134
- for layer in range(self.nLayers):
135
-
136
- if layer == 0:
137
- prev_out = input[seq]
138
-
139
- outs = self.rnns[layer](prev_out)
140
-
141
- if collect_hidden:
142
- hidden_states[layer].append(outs)
143
- elif seq == seq_len-1:
144
- hidden_states[layer].append(outs)
145
-
146
- prev_out = outs[0]
147
-
148
- outputs.append(prev_out)
149
-
150
- if reverse:
151
- outputs = list(reversed(outputs))
152
- '''
153
- At this point outputs is in format:
154
- list( [seq_length] x Tensor([bsz][features]) )
155
- need to convert it to:
156
- list( Tensor([seq_length][bsz][features]) )
157
- '''
158
- output = flatten_list(outputs)
159
-
160
- '''
161
- hidden_states at this point is in format:
162
- list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
163
- need to convert it to:
164
- For not collect hidden:
165
- list( [hidden_states] x Tensor([layer][bsz][features]) )
166
- For collect hidden:
167
- list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
168
- '''
169
- if not collect_hidden:
170
- seq_len = 1
171
- n_hid = self.rnns[0].n_hidden_states
172
- new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]
173
-
174
-
175
- for i in range(n_hid):
176
- for j in range(seq_len):
177
- for k in range(self.nLayers):
178
- new_hidden[i][j][k] = hidden_states[k][j][i]
179
-
180
- hidden_states = new_hidden
181
- #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
182
- #Reverse seq_length if reverse
183
- if reverse:
184
- hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)
185
-
186
- #flatten layer dimension into tensor
187
- hiddens = list( list(
188
- flatten_list(seq) for seq in hidden )
189
- for hidden in hidden_states )
190
-
191
- #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
192
- #Remove seq_length dimension if not collect_hidden
193
- if not collect_hidden:
194
- hidden_states = list( entry[0] for entry in hidden_states)
195
- return output, hidden_states
196
-
197
- def reset_parameters(self):
198
- """
199
- reset_parameters()
200
- """
201
- for rnn in self.rnns:
202
- rnn.reset_parameters()
203
-
204
- def init_hidden(self, bsz):
205
- """
206
- init_hidden()
207
- """
208
- for rnn in self.rnns:
209
- rnn.init_hidden(bsz)
210
-
211
- def detach_hidden(self):
212
- """
213
- detach_hidden()
214
- """
215
- for rnn in self.rnns:
216
- rnn.detach_hidden()
217
-
218
- def reset_hidden(self, bsz):
219
- """
220
- reset_hidden()
221
- """
222
- for rnn in self.rnns:
223
- rnn.reset_hidden(bsz)
224
-
225
- def init_inference(self, bsz):
226
- """
227
- init_inference()
228
- """
229
- for rnn in self.rnns:
230
- rnn.init_inference(bsz)
231
-
232
- class RNNCell(nn.Module):
233
- """
234
- RNNCell
235
- gate_multiplier is related to the architecture you're working with
236
- For LSTM-like it will be 4 and GRU-like will be 3.
237
- Always assumes input is NOT batch_first.
238
- Output size that's not hidden size will use output projection
239
- Hidden_states is number of hidden states that are needed for cell
240
- if one will go directly to cell as tensor, if more will go as list
241
- """
242
- def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None):
243
- super(RNNCell, self).__init__()
244
-
245
- self.gate_multiplier = gate_multiplier
246
- self.input_size = input_size
247
- self.hidden_size = hidden_size
248
- self.cell = cell
249
- self.bias = bias
250
- self.output_size = output_size
251
- if output_size is None:
252
- self.output_size = hidden_size
253
-
254
- self.gate_size = gate_multiplier * self.hidden_size
255
- self.n_hidden_states = n_hidden_states
256
-
257
- self.w_ih = nn.Parameter(torch.empty(self.gate_size, self.input_size))
258
- self.w_hh = nn.Parameter(torch.empty(self.gate_size, self.output_size))
259
-
260
- #Check if there's recurrent projection
261
- if(self.output_size != self.hidden_size):
262
- self.w_ho = nn.Parameter(torch.empty(self.output_size, self.hidden_size))
263
-
264
- self.b_ih = self.b_hh = None
265
- if self.bias:
266
- self.b_ih = nn.Parameter(torch.empty(self.gate_size))
267
- self.b_hh = nn.Parameter(torch.empty(self.gate_size))
268
-
269
- #hidden states for forward
270
- self.hidden = [ None for states in range(self.n_hidden_states)]
271
-
272
- self.reset_parameters()
273
-
274
- def new_like(self, new_input_size=None):
275
- """
276
- new_like()
277
- """
278
- if new_input_size is None:
279
- new_input_size = self.input_size
280
-
281
- return type(self)(self.gate_multiplier,
282
- new_input_size,
283
- self.hidden_size,
284
- self.cell,
285
- self.n_hidden_states,
286
- self.bias,
287
- self.output_size)
288
-
289
-
290
- #Use xavier where we can (weights), otherwise use uniform (bias)
291
- def reset_parameters(self, gain=1):
292
- """
293
- reset_parameters()
294
- """
295
- stdev = 1.0 / math.sqrt(self.hidden_size)
296
- for param in self.parameters():
297
- param.data.uniform_(-stdev, stdev)
298
- '''
299
- Xavier reset:
300
- def reset_parameters(self, gain=1):
301
- stdv = 1.0 / math.sqrt(self.gate_size)
302
-
303
- for param in self.parameters():
304
- if (param.dim() > 1):
305
- torch.nn.init.xavier_normal(param, gain)
306
- else:
307
- param.data.uniform_(-stdv, stdv)
308
- '''
309
- def init_hidden(self, bsz):
310
- """
311
- init_hidden()
312
- """
313
- for param in self.parameters():
314
- if param is not None:
315
- a_param = param
316
- break
317
-
318
- for i, _ in enumerate(self.hidden):
319
- if(self.hidden[i] is None or self.hidden[i].data.size()[0] != bsz):
320
-
321
- if i==0:
322
- hidden_size = self.output_size
323
- else:
324
- hidden_size = self.hidden_size
325
-
326
- tens = a_param.data.new(bsz, hidden_size).zero_()
327
- self.hidden[i] = Variable(tens, requires_grad=False)
328
-
329
-
330
- def reset_hidden(self, bsz):
331
- """
332
- reset_hidden()
333
- """
334
- for i, _ in enumerate(self.hidden):
335
- self.hidden[i] = None
336
- self.init_hidden(bsz)
337
-
338
- def detach_hidden(self):
339
- """
340
- detach_hidden()
341
- """
342
- for i, _ in enumerate(self.hidden):
343
- if self.hidden[i] is None:
344
- raise RuntimeError("Must initialize hidden state before you can detach it")
345
- for i, _ in enumerate(self.hidden):
346
- self.hidden[i] = self.hidden[i].detach()
347
-
348
- def forward(self, input):
349
- """
350
- forward()
351
- if not inited or bsz has changed this will create hidden states
352
- """
353
- self.init_hidden(input.size()[0])
354
-
355
- hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
356
- self.hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
357
- if(self.n_hidden_states > 1):
358
- self.hidden = list(self.hidden)
359
- else:
360
- self.hidden=[self.hidden]
361
-
362
- if self.output_size != self.hidden_size:
363
- self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
364
-
365
- return tuple(self.hidden)
apex/apex/RNN/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .models import LSTM, GRU, ReLU, Tanh, mLSTM
-
- __all__ = ['models']
apex/apex/RNN/cells.py DELETED
@@ -1,84 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- from .RNNBackend import RNNCell
-
- from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
-
- import math
-
-
- class mLSTMRNNCell(RNNCell):
-     """
-     mLSTMRNNCell
-     """
-
-     def __init__(self, input_size, hidden_size, bias = False, output_size = None):
-         gate_multiplier = 4
-         super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
-
-         self.w_mih = nn.Parameter(torch.empty(self.output_size, self.input_size))
-         self.w_mhh = nn.Parameter(torch.empty(self.output_size, self.output_size))
-
-         self.reset_parameters()
-
-     def forward(self, input):
-         """
-         mLSTMRNNCell.forward()
-         """
-         #if not inited or bsz has changed this will create hidden states
-         self.init_hidden(input.size()[0])
-
-         hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
-
-         self.hidden = list(
-             self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
-                       b_ih=self.b_ih, b_hh=self.b_hh)
-         )
-
-         if self.output_size != self.hidden_size:
-             self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
-         return tuple(self.hidden)
-
-
-     def new_like(self, new_input_size=None):
-         if new_input_size is None:
-             new_input_size = self.input_size
-
-         return type(self)(
-             new_input_size,
-             self.hidden_size,
-             self.bias,
-             self.output_size)
-
- def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
-     """
-     mLSTMCell
-     """
-
-     if input.is_cuda:
-         igates = F.linear(input, w_ih)
-         m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
-         hgates = F.linear(m, w_hh)
-
-         state = fusedBackend.LSTMFused.apply
-         return state(igates, hgates, hidden[1], b_ih, b_hh)
-
-     hx, cx = hidden
-
-     m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
-     gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
-
-     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
-
-     ingate = F.sigmoid(ingate)
-     forgetgate = F.sigmoid(forgetgate)
-     cellgate = F.tanh(cellgate)
-     outgate = F.sigmoid(outgate)
-
-     cy = (forgetgate * cx) + (ingate * cellgate)
-     hy = outgate * F.tanh(cy)
-
-     return hy, cy
-
apex/apex/RNN/models.py DELETED
@@ -1,56 +0,0 @@
- import torch
-
- from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
-
- from apex import deprecated_warning
- from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
- from .cells import mLSTMRNNCell, mLSTMCell
-
- def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
-     """
-     :class:`toRNNBackend`
-     """
-
-     deprecated_warning("`apex.RNN` is deprecated and will be removed by the end of February 2023.")
-     if bidirectional:
-         return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
-     else:
-         return stackedRNN(inputRNN, num_layers, dropout = dropout)
-
-
- def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
-     """
-     :class:`LSTM`
-     """
-     inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
-     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
-
- def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
-     """
-     :class:`GRU`
-     """
-     inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
-     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
-
- def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
-     """
-     :class:`ReLU`
-     """
-     inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
-     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
-
- def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
-     """
-     :class:`Tanh`
-     """
-     inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
-     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
-
- def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
-     """
-     :class:`mLSTM`
-     """
-     inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
-     return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
-
-
apex/apex/__init__.py DELETED
@@ -1,68 +0,0 @@
- import logging
- import warnings
-
- # May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
- import torch
-
-
- __all__ = ["amp", "fp16_utils", "optimizers", "normalization", "transformer"]
-
-
- if torch.distributed.is_available():
-     from . import parallel
-     __all__.append("parallel")
-
- from . import amp
- from . import fp16_utils
-
- # For optimizers and normalization there is no Python fallback.
- # Absence of cuda backend is a hard error.
- # I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
- # to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
- # so they expect those backends to be available, but for some reason they actually aren't
- # available (for example because they built improperly in a way that isn't revealed until
- # load time) the error message is timely and visible.
- from . import optimizers
- from . import normalization
- from . import transformer
-
-
- # Logging utilities for apex.transformer module
- class RankInfoFormatter(logging.Formatter):
-
-     def format(self, record):
-         from apex.transformer.parallel_state import get_rank_info
-         record.rank_info = get_rank_info()
-         return super().format(record)
-
-
- _library_root_logger = logging.getLogger(__name__)
- handler = logging.StreamHandler()
- handler.setFormatter(RankInfoFormatter("%(asctime)s - PID:%(process)d - rank:%(rank_info)s - %(filename)s:%(lineno)d - %(levelname)s - %(message)s", "%y-%m-%d %H:%M:%S"))
- _library_root_logger.addHandler(handler)
- _library_root_logger.propagate = False
-
-
- def check_cudnn_version_and_warn(global_option: str, required_cudnn_version: int) -> bool:
-     cudnn_available = torch.backends.cudnn.is_available()
-     cudnn_version = torch.backends.cudnn.version() if cudnn_available else None
-     if not (cudnn_available and (cudnn_version >= required_cudnn_version)):
-         warnings.warn(
-             f"`{global_option}` depends on cuDNN {required_cudnn_version} or later, "
-             f"but {'cuDNN is not available' if not cudnn_available else cudnn_version}"
-         )
-         return False
-     return True
-
-
- class DeprecatedFeatureWarning(FutureWarning):
-     pass
-
-
- def deprecated_warning(msg: str) -> None:
-     if (
-         not torch.distributed.is_available
-         or not torch.distributed.is_initialized()
-         or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)
-     ):
-         warnings.warn(msg, DeprecatedFeatureWarning)
apex/apex/_autocast_utils.py DELETED
@@ -1,26 +0,0 @@
- from typing import Optional, Sequence
-
- import torch
-
-
- __all__ = ["_cast_if_autocast_enabled"]
-
-
- def _get_autocast_dtypes() -> Sequence[torch.dtype]:
-     if torch.cuda.is_bf16_supported():
-         return [torch.half, torch.bfloat16]
-     return [torch.half]
-
-
- def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
-     if not torch.is_autocast_enabled():
-         return torch.float or dtype
-     else:
-         return torch.get_autocast_gpu_dtype()
-
-
- def _cast_if_autocast_enabled(*args):
-     if not torch.is_autocast_enabled():
-         return args
-     else:
-         return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
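For context (an editor-supplied sketch, not part of the deleted file): a helper like `_cast_if_autocast_enabled` is typically used to cast a custom op's inputs to the autocast dtype up front and then run the op with autocast disabled, so the op sees uniform dtypes; apex follows this pattern around its fused kernels. `fused_op` below is a hypothetical stand-in for such a kernel, and the helper body is copied from the file above so the sketch stays self-contained:

```python
import torch


def _cast_if_autocast_enabled(*args):
    # Same logic as the deleted helper above.
    if not torch.is_autocast_enabled():
        return args
    return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())


def fused_op(x, weight):
    # Hypothetical stand-in for a hand-written kernel; a plain linear keeps it runnable.
    return torch.nn.functional.linear(x, weight)


def fused_op_autocast_aware(x, weight):
    # Pre-cast the inputs once, then disable autocast so the "kernel" is not re-wrapped.
    casted = _cast_if_autocast_enabled(x, weight)
    with torch.cuda.amp.autocast(enabled=False):
        return fused_op(*casted)
```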
apex/apex/amp/README.md DELETED
@@ -1,72 +0,0 @@
- # amp: Automatic Mixed Precision
-
- ## Annotating User Functions
-
- Nearly all PyTorch user code needs nothing more than the two steps
- above to use amp. After all, custom layers are built out of simpler
- PyTorch components, and amp already can see those.
-
- However, any custom C++ or CUDA code is outside of amp's (default)
- view of things. For example, suppose I implemented a new recurrent
- cell called a "forgetful recurrent unit" that calls directly into a
- CUDA backend:
-
- ```python
- from backend import FRUBackend
-
- def fru(input, hidden, weight, bias):
-     # call to CUDA code
-     FRUBackend(input, hidden, weight, bias)
- ```
-
- In this case, it is possible to get a runtime type mismatch. For
- example, you might have `input` in fp16, and `weight` in fp32, and amp
- doesn't have the visibility to insert an appropriate cast.
-
- amp exposes two ways to handle "invisible" backend code: function
- annotations and explicit registration.
-
- #### Function annotation
-
- The first way to handle backend code is a set of function annotations:
-
- - `@amp.half_function`
- - `@amp.float_function`
- - `@amp.promote_function`
-
- These correspond to:
-
- - Cast all arguments to fp16
- - Cast all argumnets fo fp32
- - If there are any type mismatches, cast everything to the widest type
-
- In our example, we believe that the FRU unit is fp16-safe and will get
- performance gains from casting its arguments to fp16, so we write:
-
- ```python
- @amp.half_function
- def fru(input, hidden, weight, bias):
-     #...
- ```
-
- #### Explicit registration
-
- The other way to handle backend code is with explicit function
- registration:
-
- - `amp.register_half_function(module, function_name)`
- - `amp.register_float_function(module, function_name)`
- - `amp.register_promote_function(module, function_name)`
-
- When using this API, `module` is the containing class or module for
- the function, and `function_name` is the _string_ name of the
- function. Note that the function must be registered before the call to
- `amp.initalize()`.
-
- For our FRU unit, we can register the backend function directly:
-
- ```python
- import backend
-
- amp.register_half_function(backend, 'FRUBackend')
- ```
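One detail worth keeping straight from the section above is ordering: registration has to happen before `amp.initialize()`. A rough, editor-supplied sketch (assuming `apex.amp` is installed and a CUDA device is present; `backend` and `FRUBackend` are the README's hypothetical names and are left commented out):

```python
import torch
from apex import amp

model = torch.nn.Linear(16, 16).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Register custom backend functions *before* amp.initialize(), as noted above.
# import backend                                      # hypothetical module
# amp.register_half_function(backend, 'FRUBackend')   # hypothetical function name

model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
```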
apex/apex/amp/__init__.py DELETED
@@ -1,5 +0,0 @@
- from .amp import init, half_function, float_function, promote_function,\
-     register_half_function, register_float_function, register_promote_function
- from .handle import scale_loss, disable_casts
- from .frontend import initialize, state_dict, load_state_dict
- from ._amp_state import master_params, _amp_state
apex/apex/amp/__version__.py DELETED
@@ -1,2 +0,0 @@
- VERSION = (0, 1, 0)
- __version__ = '.'.join(map(str, VERSION))
apex/apex/amp/_amp_state.py DELETED
@@ -1,59 +0,0 @@
- # This is a "header object" that allows different amp modules to communicate.
- # I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
- # But apparently it's ok:
- # http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
- import torch
-
-
- class AmpState(object):
-     def __init__(self):
-         self.hard_override=False
-         self.allow_incoming_model_not_fp32 = False
-         self.verbosity=1
-
-
- # Attribute stash. Could also just stash things as global module attributes.
- _amp_state = AmpState()
-
-
- def warn_or_err(msg):
-     if _amp_state.hard_override:
-         print("Warning: " + msg)
-     else:
-         raise RuntimeError(msg)
-         # I'm not sure if allowing hard_override is a good idea.
-         # + " If you're sure you know what you're doing, supply " +
-         # "hard_override=True to amp.initialize.")
-
-
- def maybe_print(msg, rank0=False):
-     distributed = torch.distributed.is_available() and \
-         torch.distributed.is_initialized() and \
-         torch.distributed.get_world_size() > 1
-     if _amp_state.verbosity > 0:
-         if rank0:
-             if distributed:
-                 if torch.distributed.get_rank() == 0:
-                     print(msg)
-             else:
-                 print(msg)
-         else:
-             print(msg)
-
-
- # def iter_params(param_groups):
- #     for group in param_groups:
- #         for p in group['params']:
- #             yield p
-
-
- def master_params(optimizer):
-     """
-     Generator expression that iterates over the params owned by ``optimizer``.
-
-     Args:
-         optimizer: An optimizer previously returned from ``amp.initialize``.
-     """
-     for group in optimizer.param_groups:
-         for p in group['params']:
-             yield p
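As a usage note (editorial, not part of the diff): `master_params` above is the hook the legacy amp documentation points at for operations such as gradient clipping, so that they act on the FP32 master weights rather than the model's FP16 copies. A rough sketch, assuming `apex.amp` is installed and a CUDA device is available:

```python
import torch
from apex import amp

model = torch.nn.Linear(32, 32).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

loss = model(torch.randn(8, 32, device='cuda')).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()

# Clip the master parameters yielded by master_params(), not model.parameters(),
# so clipping operates on the unscaled FP32 gradients.
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
optimizer.step()
```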
apex/apex/amp/_initialize.py DELETED
@@ -1,265 +0,0 @@
1
- import collections.abc as container_abcs
2
- from types import MethodType
3
- import functools
4
- import sys
5
- import warnings
6
-
7
- import numpy as np
8
- import torch
9
-
10
- from ._amp_state import _amp_state, warn_or_err
11
- from .handle import disable_casts
12
- from .scaler import LossScaler
13
- from ._process_optimizer import _process_optimizer
14
- from apex.fp16_utils import convert_network
15
- from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
16
- from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
17
-
18
- if torch.distributed.is_available():
19
- from ..parallel import DistributedDataParallel as apex_DDP
20
- from ..parallel.LARC import LARC
21
-
22
-
23
- def to_type(dtype, t):
24
- if isinstance(t, torch.Tensor):
25
- if not t.is_cuda:
26
- # This should not be a hard error, since it may be legitimate.
27
- warnings.warn("An input tensor was not cuda.")
28
- # GANs require this.
29
- # if t.requires_grad:
30
- # warn_or_err("input data requires grad. Since input data is not a model parameter,\n"
31
- # "its gradients will not be properly allreduced by DDP.")
32
- if t.is_floating_point():
33
- return t.to(dtype)
34
- return t
35
- else:
36
- # Trust the user's custom batch type, that's all I can do here.
37
- return t.to(dtype)
38
-
39
-
40
- # Modified from torch.optim.optimizer.py. This is a bit more general than casted_args in utils.py.
41
- def applier(value, fn):
42
- if isinstance(value, torch.Tensor):
43
- return fn(value)
44
- elif isinstance(value, str):
45
- return value
46
- elif isinstance(value, np.ndarray):
47
- return value
48
- elif hasattr(value, "to"): # Allow handling of custom batch classes
49
- return fn(value)
50
- elif isinstance(value, container_abcs.Mapping):
51
- return {applier(k, fn) : applier(v, fn) for k, v in value.items()}
52
- elif isinstance(value, container_abcs.Iterable):
53
- return type(value)(applier(v, fn) for v in value)
54
- else:
55
- # Do I want this to fire off even if someone chooses to pass something ordinary like
56
- # an int or float? May be more annoying than it's worth.
57
- # print("Warning: unrecognized type in applier. If your input data is a custom class, "
58
- # "provide it with a .to(dtype) method which converts its floating-point Tensors to dtype. "
59
- # "Amp will check for your custom to() and invoke it to cast the batch's "
60
- # "floating-point Tensors to the appropriate type. "
61
- # "Also, if your data is a custom class, it is your responsibility to ensure that "
62
- # "any Tensors you want to be cuda are already cuda."
63
- return value
64
-
65
-
66
- def check_models(models):
67
- for model in models:
68
- parallel_type = None
69
- if isinstance(model, torch.nn.parallel.DistributedDataParallel):
70
- parallel_type = "torch.nn.parallel.DistributedDataParallel"
71
- if ('apex_DDP' in sys.modules) and isinstance(model, apex_DDP):
72
- parallel_type = "apex.parallel.DistributedDataParallel"
73
- if isinstance(model, torch.nn.parallel.DataParallel):
74
- parallel_type = "torch.nn.parallel.DataParallel"
75
- if parallel_type is not None:
76
- raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
77
- "Parallel wrappers should only be applied to the model(s) AFTER \n"
78
- "the model(s) have been returned from amp.initialize.")
79
-
80
-
81
- def check_params_fp32(models):
82
- for model in models:
83
- for name, param in model.named_parameters():
84
- if param.is_floating_point():
85
- if 'Half' in param.type():
86
- warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
87
- "When using amp.initialize, you do not need to call .half() on your model\n"
88
- "before passing it, no matter what optimization level you choose.".format(
89
- name, param.type()))
90
- elif not param.is_cuda:
91
- warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
92
- "When using amp.initialize, you need to provide a model with parameters\n"
93
- "located on a CUDA device before passing it no matter what optimization level\n"
94
- "you chose. Use model.to('cuda') to use the default device.".format(
95
- name, param.type()))
96
-
97
- # Backward compatibility for PyTorch 0.4
98
- if hasattr(model, 'named_buffers'):
99
- buf_iter = model.named_buffers()
100
- else:
101
- buf_iter = model._buffers
102
- for obj in buf_iter:
103
- if type(obj)==tuple:
104
- name, buf = obj
105
- else:
106
- name, buf = obj, buf_iter[obj]
107
- if buf.is_floating_point():
108
- if 'Half' in buf.type():
109
- warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
110
- "When using amp.initialize, you do not need to call .half() on your model\n"
111
- "before passing it, no matter what optimization level you choose.".format(
112
- name, buf.type()))
113
- elif not buf.is_cuda:
114
- warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
115
- "When using amp.initialize, you need to provide a model with buffers\n"
116
- "located on a CUDA device before passing it no matter what optimization level\n"
117
- "you chose. Use model.to('cuda') to use the default device.".format(
118
- name, buf.type()))
119
-
120
-
121
- def check_optimizers(optimizers):
122
- for optim in optimizers:
123
- bad_optim_type = None
124
- if isinstance(optim, FP16_Optimizer_general):
125
- bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
126
- if isinstance(optim, FP16_Optimizer_for_fused):
127
- bad_optim_type = "apex.optimizers.FP16_Optimizer"
128
- if bad_optim_type is not None:
129
- raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
130
- "The optimizer(s) passed to amp.initialize() must be bare \n"
131
- "instances of either ordinary Pytorch optimizers, or Apex fused \n"
132
- "optimizers.\n")
133
-
134
-
135
- class O2StateDictHook(object):
136
- def __init__(self, fn):
137
- self.fn = fn
138
-
139
- def __call__(self, module, state_dict, prefix, local_metadata):
140
- for key in state_dict:
141
- param = state_dict[key]
142
- if 'Half' in param.type():
143
- param = param.to(torch.float32)
144
- state_dict[key] = param
145
-
146
-
147
- def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
148
- from .amp import init as amp_init
149
-
150
- optimizers_was_list = False
151
- if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
152
- optimizers = [optimizers]
153
- elif optimizers is None:
154
- optimizers = []
155
- elif isinstance(optimizers, list):
156
- optimizers_was_list = True
157
- check_optimizers(optimizers)
158
- else:
159
- check_optimizers([optimizers])
160
- raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")
161
-
162
- if isinstance(models, torch.nn.Module):
163
- models_was_list = False
164
- models = [models]
165
- elif isinstance(models, list):
166
- models_was_list = True
167
- else:
168
- raise TypeError("models must be either a single model or a list of models.")
169
-
170
- check_models(models)
171
-
172
- if not _amp_state.allow_incoming_model_not_fp32:
173
- check_params_fp32(models)
174
-
175
- # In the future, when FP16_Optimizer can be deprecated and master weights can
176
- # become an attribute, remember to stash master weights before casting the model.
177
-
178
- if properties.cast_model_type:
179
- if properties.keep_batchnorm_fp32:
180
- for model in models:
181
- convert_network(model, properties.cast_model_type)
182
- else:
183
- for model in models:
184
- model.to(properties.cast_model_type)
185
-
186
- input_caster = functools.partial(to_type, properties.cast_model_type)
187
- if cast_model_outputs is not None:
188
- output_caster = functools.partial(to_type, cast_model_outputs)
189
- else:
190
- output_caster = functools.partial(to_type, torch.float32)
191
-
192
- for model in models:
193
- # Patch the forward method to cast incoming data to the correct type, and
194
- # outgoing data to float32, so "the user never needs to call .half()."
195
- # I like writing things explicitly more than decorators.
196
- def patch_forward(old_fwd):
197
- def new_fwd(*args, **kwargs):
198
- output = old_fwd(*applier(args, input_caster),
199
- **applier(kwargs, input_caster))
200
- return applier(output, output_caster)
201
- return new_fwd
202
-
203
- model.forward = patch_forward(model.forward)
204
-
205
- # State dict trick to recast any preexisting per-param state tensors
206
- for optimizer in optimizers:
207
- optimizer.load_state_dict(optimizer.state_dict())
208
-
209
- # patch model.state_dict() to return float32 params
210
- for model in models:
211
- for module in model.modules():
212
- module._register_state_dict_hook(O2StateDictHook(functools.partial(to_type, torch.float32)))
213
-
214
- elif cast_model_outputs is not None:
215
- output_caster = functools.partial(to_type, cast_model_outputs)
216
-
217
- for model in models:
218
- def patch_forward(old_fwd):
219
- def new_fwd(*args, **kwargs):
220
- output = old_fwd(*args, **kwargs)
221
- return applier(output, output_caster)
222
- return new_fwd
223
-
224
- model.forward = patch_forward(model.forward)
225
-
226
- for i, optimizer in enumerate(optimizers):
227
- optimizers[i] = _process_optimizer(optimizer, properties)
228
-
229
- _amp_state.loss_scalers = []
230
- for _ in range(num_losses):
231
- _amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
232
- min_loss_scale=_amp_state.min_loss_scale,
233
- max_loss_scale=_amp_state.max_loss_scale))
234
-
235
- if properties.patch_torch_functions:
236
- # handle is unused here. It's accessible later through a global value anyway.
237
- handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
238
- for optimizer in optimizers:
239
- # Disable Amp casting for the optimizer step, because it should only be
240
- # applied to FP32 master params anyway.
241
- def patch_step(old_step):
242
- def new_step(self, *args, **kwargs):
243
- with disable_casts():
244
- output = old_step(*args, **kwargs)
245
- return output
246
- return new_step
247
-
248
- optimizer.step = MethodType(patch_step(optimizer.step), optimizer)
249
-
250
- if optimizers_was_list:
251
- if models_was_list:
252
- return models, optimizers
253
- else:
254
- return models[0], optimizers
255
- else:
256
- if models_was_list:
257
- if len(optimizers) == 0:
258
- return models
259
- else:
260
- return models, optimizers[0]
261
- else:
262
- if len(optimizers) == 0:
263
- return models[0]
264
- else:
265
- return models[0], optimizers[0]
apex/apex/amp/_process_optimizer.py DELETED
@@ -1,489 +0,0 @@
1
- import types
2
- from ..fp16_utils import master_params_to_model_params
3
- from ..multi_tensor_apply import multi_tensor_applier
4
- from ._amp_state import maybe_print
5
- import torch
6
- from ..optimizers import FusedSGD
7
-
8
-
9
- class AmpOptimizerState(object):
10
- def __init__(self):
11
- pass
12
-
13
-
14
- def _master_params_to_model_params(self):
15
- stash = self._amp_stash
16
- if multi_tensor_applier.available:
17
- if len(stash.all_fp16_params) > 0:
18
- multi_tensor_applier(
19
- stash.multi_tensor_scale,
20
- stash.dummy_overflow_buf,
21
- [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
22
- 1.0)
23
- else:
24
- for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
25
- master_params_to_model_params(fp16_group, fp32_from_fp16_group)
26
-
27
-
28
- def lazy_init_with_master_weights(self):
29
- stash = self._amp_stash
30
- stash.fp16_groups = []
31
- stash.fp32_from_fp16_groups = []
32
- stash.fp32_from_fp32_groups = []
33
- for i, param_group in enumerate(self.param_groups):
34
- # maybe_print("FP16_Optimizer processing param group {}:".format(i))
35
- fp16_params_this_group = []
36
- fp32_params_this_group = []
37
- fp32_from_fp16_params_this_group = []
38
- for i, param in enumerate(param_group['params']):
39
- if param.requires_grad:
40
- if param.type() == 'torch.cuda.HalfTensor':
41
- # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
42
- # .format(param.size()))
43
- fp16_params_this_group.append(param)
44
- master_param = param.detach().clone().float()
45
- master_param.requires_grad = True
46
- param_group['params'][i] = master_param
47
- fp32_from_fp16_params_this_group.append(master_param)
48
- # Reset existing state dict key to the new master param.
49
- # We still need to recast per-param state tensors, if any, to FP32.
50
- if param in self.state:
51
- self.state[master_param] = self.state.pop(param)
52
- elif param.type() == 'torch.cuda.FloatTensor':
53
- # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
54
- # .format(param.size()))
55
- fp32_params_this_group.append(param)
56
- param_group['params'][i] = param
57
- else:
58
- raise TypeError("Optimizer's parameters must be either "
59
- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
60
- "Received {}".format(param.type()))
61
-
62
- stash.fp16_groups.append(fp16_params_this_group)
63
- stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
64
- stash.fp32_from_fp32_groups.append(fp32_params_this_group)
65
-
66
- stash.all_fp16_params = []
67
- for group in stash.fp16_groups:
68
- stash.all_fp16_params += group
69
-
70
- stash.all_fp32_from_fp16_params = []
71
- for group in stash.fp32_from_fp16_groups:
72
- stash.all_fp32_from_fp16_params += group
73
-
74
- stash.all_fp32_from_fp32_params = []
75
- for group in stash.fp32_from_fp32_groups:
76
- stash.all_fp32_from_fp32_params += group
77
-
78
- # all_fp16_grad_stash is only needed for fused optimizers.
79
- stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
80
- # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
81
- stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
82
-
83
- for param in stash.all_fp32_from_fp16_params:
84
- param.grad = None
85
-
86
- for param in stash.all_fp32_from_fp32_params:
87
- param.grad = None
88
-
89
- # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
90
- self.load_state_dict(self.state_dict())
91
-
92
-
93
- def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
94
- grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
95
-
96
- # not much to do if scale == 1.0 and static scaling
97
- if scaler.loss_scale() == 1.0 and not scaler.dynamic:
98
- # Clear the stash.
99
- for i in range(len(stashed_grads)):
100
- stashed_grads[i] = None
101
- return
102
-
103
- if scale_override is not None:
104
- grads_have_scale, stashed_have_scale, out_scale = scale_override
105
-
106
- # This is a lot of python overhead...
107
- grads_needing_unscale = []
108
- grads_needing_unscale_with_stash = []
109
- stashed = []
110
- for param, stashed_grad in zip(params, stashed_grads):
111
- if param.grad is None and stashed_grad is not None:
112
- param.grad = stashed_grad
113
- elif param.grad is not None and stashed_grad is None:
114
- grads_needing_unscale.append(param.grad)
115
- elif param.grad is not None and stashed_grad is not None:
116
- grads_needing_unscale_with_stash.append(param.grad)
117
- stashed.append(stashed_grad)
118
- else: # param.grad is None and stashed_grad is None
119
- continue
120
-
121
- # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
122
- if len(grads_needing_unscale) > 0:
123
- scaler.unscale(
124
- grads_needing_unscale,
125
- grads_needing_unscale,
126
- None, # unused_scale, currently present to avoid API breakage elsewhere
127
- models_are_masters=True,
128
- scale_override=grads_have_scale/out_scale)
129
-
130
- if len(grads_needing_unscale_with_stash) > 0:
131
- scaler.unscale_with_stashed(
132
- grads_needing_unscale_with_stash,
133
- stashed,
134
- grads_needing_unscale_with_stash,
135
- scale_override=(grads_have_scale, stashed_have_scale, out_scale))
136
-
137
- # Clear the stash.
138
- for i in range(len(stashed_grads)):
139
- stashed_grads[i] = None
140
-
141
-
142
- def prepare_backward_with_master_weights(self):
143
- stash = self._amp_stash
144
-
145
- self._amp_lazy_init()
146
-
147
- for i, param in enumerate(stash.all_fp16_params):
148
- # Set up to leverage grad copy elision.
149
- # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
150
- param.grad = None
151
-
152
- # for i, param in enumerate(stash.all_fp32_from_fp16_params):
153
- # stash.all_fp32_from_fp16_grad_stash[i] = param.grad
154
-
155
- for i, param in enumerate(stash.all_fp32_from_fp32_params):
156
- stash.all_fp32_from_fp32_grad_stash[i] = param.grad
157
- # Set up to leverage grad copy elision:
158
- param.grad = None
159
-
160
-
161
- def post_backward_with_master_weights(self, scaler):
162
- stash = self._amp_stash
163
-
164
- self._amp_lazy_init()
165
-
166
- # This is a lot of python overhead...
167
- fp16_grads_needing_unscale = []
168
- new_fp32_grads = []
169
- fp16_grads_needing_unscale_with_stash = []
170
- preexisting_fp32_grads = []
171
- for fp16_param, fp32_param in zip(stash.all_fp16_params,
172
- stash.all_fp32_from_fp16_params):
173
- if fp16_param.grad is None and fp32_param.grad is not None:
174
- continue
175
- elif fp16_param.grad is not None and fp32_param.grad is None:
176
- fp32_param.grad = torch.empty_like(fp32_param)
177
- fp16_grads_needing_unscale.append(fp16_param.grad)
178
- new_fp32_grads.append(fp32_param.grad)
179
- elif fp16_param.grad is not None and fp32_param.grad is not None:
180
- fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
181
- preexisting_fp32_grads.append(fp32_param.grad)
182
- else: # fp16_param.grad is None and fp32_param.grad is None:
183
- continue
184
-
185
- if len(fp16_grads_needing_unscale) > 0:
186
- scaler.unscale(
187
- fp16_grads_needing_unscale,
188
- new_fp32_grads,
189
- scaler.loss_scale(),
190
- models_are_masters=False)
191
-
192
- if len(fp16_grads_needing_unscale_with_stash) > 0:
193
- scaler.unscale_with_stashed(
194
- fp16_grads_needing_unscale_with_stash,
195
- preexisting_fp32_grads,
196
- preexisting_fp32_grads)
197
-
198
- # fp32 params can be treated as they would be in the "no_master_weights" case.
199
- post_backward_models_are_masters(
200
- scaler,
201
- stash.all_fp32_from_fp32_params,
202
- stash.all_fp32_from_fp32_grad_stash)
203
-
204
-
205
- def lazy_init_no_master_weights(self):
206
- stash = self._amp_stash
207
- stash.all_fp16_params = []
208
- stash.all_fp32_params = []
209
- for i, param_group in enumerate(self.param_groups):
210
- for i, param in enumerate(param_group['params']):
211
- if param.type() == 'torch.cuda.HalfTensor':
212
- stash.all_fp16_params.append(param)
213
- elif param.type() == 'torch.cuda.FloatTensor':
214
- stash.all_fp32_params.append(param)
215
- else:
216
- raise TypeError("Optimizer's parameters must be either "
217
- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
218
- "Received {}".format(param.type()))
219
-
220
- stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
221
- stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
222
-
223
-
224
- def prepare_backward_no_master_weights(self):
225
- stash = self._amp_stash
226
-
227
- self._amp_lazy_init()
228
-
229
- for i, param in enumerate(stash.all_fp16_params):
230
- stash.all_fp16_grad_stash[i] = param.grad
231
- # Set up to leverage grad copy elision:
232
- param.grad = None
233
-
234
- for i, param in enumerate(stash.all_fp32_params):
235
- stash.all_fp32_grad_stash[i] = param.grad
236
- # Set up to leverage grad copy elision:
237
- param.grad = None
238
-
239
-
240
- def post_backward_no_master_weights(self, scaler):
241
- stash = self._amp_stash
242
-
243
- self._amp_lazy_init()
244
-
245
- split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
246
- (stash.all_fp32_params, stash.all_fp32_grad_stash))
247
-
248
- for params, stashed_grads in split_types:
249
- post_backward_models_are_masters(scaler, params, stashed_grads)
250
-
251
-
252
- #####################################################################################
253
- # FusedSGD versions
254
- #####################################################################################
255
-
256
- # FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
257
- # outside the kernel, so we must accumulate directly into the model grads.
258
- def prepare_backward_with_master_weights_FusedSGD(self):
259
- if self.materialize_master_grads:
260
- prepare_backward_with_master_weights(self)
261
- else:
262
- stash = self._amp_stash
263
-
264
- self._amp_lazy_init()
265
-
266
- for i, param in enumerate(stash.all_fp16_params):
267
- stash.all_fp16_grad_stash[i] = param.grad
268
- # Set up to leverage grad copy elision:
269
- param.grad = None
270
-
271
- for i, param in enumerate(stash.all_fp32_from_fp32_params):
272
- stash.all_fp32_from_fp32_grad_stash[i] = param.grad
273
- # Set up to leverage grad copy elision:
274
- param.grad = None
275
-
276
-
277
- def post_backward_with_master_weights_FusedSGD(self, scaler):
278
- if self.materialize_master_grads:
279
- post_backward_with_master_weights(self, scaler)
280
- else:
281
- stash = self._amp_stash
282
-
283
- self._amp_lazy_init()
284
-
285
- grads_have_scale = scaler.loss_scale()
286
- stashed_have_scale = self.most_recent_scale
287
- out_scale = grads_have_scale
288
- if self.scale_set_by_backward:
289
- out_scale = min(grads_have_scale, self.most_recent_scale)
290
-
291
- split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
292
- (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
293
-
294
-
295
- # unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
296
- # stashed_grads are scaled by self.most_recent_scale.
297
- for params, stashed_grads in split_types:
298
- post_backward_models_are_masters(scaler, params, stashed_grads,
299
- (grads_have_scale, stashed_have_scale, out_scale))
300
-
301
- self.most_recent_scale = out_scale
302
- self.scale_set_by_backward = True
303
-
304
-
305
- def prepare_backward_no_master_weights_FusedSGD(self):
306
- prepare_backward_no_master_weights(self)
307
-
308
-
309
- def post_backward_no_master_weights_FusedSGD(self, scaler):
310
- post_backward_no_master_weights(self, scaler)
311
-
312
-
313
- def _amp_lazy_init(self):
314
- stash = self._amp_stash
315
-
316
- if not stash.lazy_init_called:
317
- self._lazy_init_maybe_master_weights()
318
- stash.lazy_init_called = True
319
-
320
-
321
- def _process_optimizer(optimizer, properties):
322
- if hasattr(optimizer, "_amp_stash"):
323
- raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
324
- else:
325
- optimizer._amp_stash = AmpOptimizerState()
326
-
327
- optimizer._amp_stash.lazy_init_called = False
328
- optimizer._amp_stash.already_patched = False
329
- optimizer._amp_stash.params_have_scaled_gradients = False
330
-
331
- for name in ("_lazy_init_maybe_master_weights",
332
- "_master_params_to_model_params",
333
- "_prepare_amp_backward",
334
- "_post_amp_backward",
335
- "_amp_lazy_init"):
336
- if hasattr(optimizer, name):
337
- raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
338
-
339
- # TODO: Centralize exposure and import error checking for the C backend.
340
- if multi_tensor_applier.available:
341
- import amp_C
342
- optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
343
- optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
344
- optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0])
345
-
346
- if properties.master_weights:
347
- optimizer._lazy_init_maybe_master_weights = types.MethodType(
348
- lazy_init_with_master_weights, optimizer)
349
-
350
- optimizer._master_params_to_model_params = types.MethodType(
351
- _master_params_to_model_params, optimizer)
352
-
353
- old_step = optimizer.step
354
- def new_step(self, closure=None):
355
- if closure is not None:
356
- raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
357
- retval = old_step()
358
- if not isinstance(self, FusedSGD):
359
- self._master_params_to_model_params()
360
- # Clear the master grads that wouldn't be zeroed by model.zero_grad()
361
- for param in self._amp_stash.all_fp32_from_fp16_params:
362
- param.grad = None
363
- return retval
364
- optimizer.step = types.MethodType(new_step, optimizer)
365
-
366
- old_zero_grad = optimizer.zero_grad
367
- def new_zero_grad(self):
368
- stash = self._amp_stash
369
- self._amp_lazy_init()
370
- # Zero the model grads.
371
- for param in stash.all_fp16_params:
372
- if param.grad is not None:
373
- param.grad.detach_()
374
- param.grad.zero_()
375
- for param in stash.all_fp32_from_fp32_params:
376
- if param.grad is not None:
377
- param.grad.detach_()
378
- param.grad.zero_()
379
- # Clear the master grads that are independent of model grads
380
- for param in self._amp_stash.all_fp32_from_fp16_params:
381
- param.grad = None
382
- optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
383
-
384
- if isinstance(optimizer, FusedSGD):
385
- optimizer._prepare_amp_backward = types.MethodType(
386
- prepare_backward_with_master_weights_FusedSGD, optimizer)
387
- optimizer._post_amp_backward = types.MethodType(
388
- post_backward_with_master_weights_FusedSGD, optimizer)
389
- else:
390
- optimizer._prepare_amp_backward = types.MethodType(
391
- prepare_backward_with_master_weights, optimizer)
392
- optimizer._post_amp_backward = types.MethodType(
393
- post_backward_with_master_weights, optimizer)
394
- else:
395
- optimizer._lazy_init_maybe_master_weights = types.MethodType(
396
- lazy_init_no_master_weights, optimizer)
397
-
398
- if isinstance(optimizer, FusedSGD):
399
- optimizer._prepare_amp_backward = types.MethodType(
400
- prepare_backward_no_master_weights_FusedSGD, optimizer)
401
- optimizer._post_amp_backward = types.MethodType(
402
- post_backward_no_master_weights_FusedSGD, optimizer)
403
- else:
404
- optimizer._prepare_amp_backward = types.MethodType(
405
- prepare_backward_no_master_weights, optimizer)
406
- optimizer._post_amp_backward = types.MethodType(
407
- post_backward_no_master_weights, optimizer)
408
-
409
- optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
410
-
411
- old_add_param_group = optimizer.add_param_group
412
-
413
- def new_add_param_group(self, new_group):
414
- stash = self._amp_stash
415
-
416
- if not stash.lazy_init_called:
417
- self._lazy_init_maybe_master_weights()
418
- stash.lazy_init_called = True
419
-
420
- assert isinstance(new_group, dict), "param group must be a dict"
421
-
422
- new_params = new_group['params']
423
- if isinstance(new_params, torch.Tensor):
424
- new_group['params'] = [new_params]
425
- elif isinstance(new_params, set):
426
- raise TypeError('optimizer parameters need to be organized in ordered collections, but '
427
- 'the ordering of tensors in sets will change between runs. Please use a list instead.')
428
- else:
429
- new_group['params'] = list(new_params)
430
-
431
- if properties.master_weights:
432
- # Mutate new_group in-place to use FP32 master params
433
- fp16_params_this_group = []
434
- fp32_params_this_group = []
435
- fp32_from_fp16_params_this_group = []
436
- for i, param in enumerate(new_group['params']):
437
- if param.requires_grad:
438
- if param.type() == 'torch.cuda.HalfTensor':
439
- fp16_params_this_group.append(param)
440
- master_param = param.detach().clone().float()
441
- master_param.requires_grad = True
442
- new_group['params'][i] = master_param
443
- fp32_from_fp16_params_this_group.append(master_param)
444
- elif param.type() == 'torch.cuda.FloatTensor':
445
- fp32_params_this_group.append(param)
446
- new_group['params'][i] = param
447
- else:
448
- raise TypeError("Optimizer's parameters must be either "
449
- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
450
- "Received {}".format(param.type()))
451
-
452
- stash.fp16_groups.append(fp16_params_this_group)
453
- stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
454
- stash.fp32_from_fp32_groups.append(fp32_params_this_group)
455
-
456
- stash.all_fp16_params += fp16_params_this_group
457
- stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
458
- stash.all_fp32_from_fp32_params += fp32_params_this_group
459
-
460
- # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
461
- stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]
462
-
463
- # It should be ok to let params be added with existing .grad attributes.
464
- # for param in fp16_params_this_group:
465
- # param.grad = None
466
-
467
- # for param in fp32_from_fp16_params_this_group:
468
- # param.grad = None
469
-
470
- # for param in stash.fp32_params_this_group:
471
- # param.grad = None
472
- else:
473
- for param in new_group['params']:
474
- if param.type() == 'torch.cuda.HalfTensor':
475
- stash.all_fp16_params.append(param)
476
- stash.all_fp16_grad_stash.append(None)
477
- elif param.type() == 'torch.cuda.FloatTensor':
478
- stash.all_fp32_params.append(param)
479
- stash.all_fp32_grad_stash.append(None)
480
- else:
481
- raise TypeError("Optimizer's parameters must be either "
482
- "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
483
- "Received {}".format(param.type()))
484
-
485
- old_add_param_group(new_group)
486
-
487
- optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)
488
-
489
- return optimizer
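The machinery above implements the classic fp32-master-weight pattern: fp16 model params run the forward/backward, fp32 master copies receive the (unscaled) gradients and take the optimizer step, and the updated masters are copied back into the fp16 params. A stand-alone illustration of that pattern (not apex internals; assumes a CUDA device)::

    import torch

    model_param = torch.randn(10, device="cuda", dtype=torch.float16, requires_grad=True)
    master_param = model_param.detach().clone().float().requires_grad_(True)
    optimizer = torch.optim.SGD([master_param], lr=0.1)

    loss = (model_param.float() ** 2).sum()
    loss.backward()                               # grad lands on the fp16 model param
    master_param.grad = model_param.grad.float()  # model grad -> master grad (unscaling would happen here)
    optimizer.step()                              # step is taken on the fp32 master
    model_param.data.copy_(master_param.data)     # master -> model copy, as in the patched step() above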
 
apex/apex/amp/amp.py DELETED
@@ -1,183 +0,0 @@
1
- import functools
2
- import itertools
3
-
4
- import torch
5
-
6
- from . import compat, rnn_compat, utils, wrap
7
- from .handle import AmpHandle, NoOpHandle
8
- from .lists import functional_overrides, torch_overrides, tensor_overrides
9
- from ._amp_state import _amp_state
10
- from .frontend import *
11
-
12
-
13
- _DECORATOR_HANDLE = None
14
- _USER_CAST_REGISTRY = set()
15
- _USER_PROMOTE_REGISTRY = set()
16
-
17
-
18
- def _decorator_helper(orig_fn, cast_fn, wrap_fn):
19
- def wrapper(*args, **kwargs):
20
- handle = _DECORATOR_HANDLE
21
- if handle is None or not handle.is_active():
22
- return orig_fn(*args, **kwargs)
23
- inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__,
24
- handle.verbose)
25
- return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs)
26
- return wrapper
27
-
28
-
29
- # Decorator form
30
- def half_function(fn):
31
- from apex import deprecated_warning
32
- deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)")
33
- wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=True)
34
- return _decorator_helper(fn, utils.maybe_half, wrap_fn)
35
-
36
-
37
- def float_function(fn):
38
- from apex import deprecated_warning
39
- deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)")
40
- wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=False)
41
- return _decorator_helper(fn, utils.maybe_float, wrap_fn)
42
-
43
-
44
- def promote_function(fn):
45
- from apex import deprecated_warning
46
- deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)")
47
- wrap_fn = functools.partial(wrap.make_promote_wrapper)
48
- return _decorator_helper(fn, utils.maybe_float, wrap_fn)
49
-
50
-
51
- # Registry form
52
- def register_half_function(module, name):
53
- if not hasattr(module, name):
54
- raise ValueError('No function named {} in module {}.'.format(
55
- name, module))
56
- _USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
57
-
58
-
59
- def register_float_function(module, name):
60
- if not hasattr(module, name):
61
- raise ValueError('No function named {} in module {}.'.format(
62
- name, module))
63
- _USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
64
-
65
-
66
- def register_promote_function(module, name):
67
- if not hasattr(module, name):
68
- raise ValueError('No function named {} in module {}.'.format(
69
- name, module))
70
- _USER_PROMOTE_REGISTRY.add((module, name))
71
-
72
-
73
- # Top-level function to insert _all_ the hooks.
74
- def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
75
- global _DECORATOR_HANDLE
76
-
77
- if not enabled:
78
- handle = NoOpHandle()
79
- _DECORATOR_HANDLE = handle
80
- return handle
81
-
82
- handle = AmpHandle(loss_scale, enable_caching, verbose)
83
-
84
- # 0) Force-{fp16, fp32} for user-annotated functions
85
- for mod, fn, cast_fn in _USER_CAST_REGISTRY:
86
- try_caching = (cast_fn == utils.maybe_half)
87
- wrap.cached_cast(mod, fn, cast_fn, handle,
88
- try_caching, verbose)
89
- _USER_CAST_REGISTRY.clear()
90
-
91
- # 0.5) Force-promote for user-annotated functions
92
- for mod, fn in _USER_PROMOTE_REGISTRY:
93
- wrap.promote(mod, fn, handle, verbose)
94
- _USER_PROMOTE_REGISTRY.clear()
95
-
96
- # 1) Force-{fp16, fp32} on white- / black-list functions
97
- override_modules = [functional_overrides,
98
- torch_overrides,
99
- tensor_overrides]
100
- cast_table = [('FP16_FUNCS', utils.maybe_half),
101
- ('FP32_FUNCS', utils.maybe_float)]
102
- for module, (list_name, cast_fn) in itertools.product(override_modules,
103
- cast_table):
104
- for fn in getattr(module, list_name):
105
- try_caching = (cast_fn == utils.maybe_half)
106
- wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
107
- try_caching, verbose)
108
-
109
- # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
110
- # methods on FloatTensor, since they're distinct types.
111
- if compat.tensor_is_float_tensor():
112
- for fn in tensor_overrides.FP16_FUNCS:
113
- wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
114
- handle, try_caching=True, verbose=verbose)
115
- for fn in tensor_overrides.FP32_FUNCS:
116
- wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
117
- handle, try_caching=False, verbose=verbose)
118
-
119
- # 2) Enable type-promotion on multi-arg functions and methods.
120
- # NB: special handling for sequence fns (e.g. `torch.cat`).
121
- promote_modules = [torch_overrides, tensor_overrides]
122
- promote_table = [('CASTS', wrap.promote),
123
- ('SEQUENCE_CASTS', wrap.sequence_promote)]
124
- for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
125
- promote_table):
126
- for fn in getattr(promote_mod, list_name):
127
- promote_fn(promote_mod.MODULE, fn, handle, verbose)
128
-
129
- # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
130
- if compat.tensor_is_float_tensor():
131
- for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
132
- torch.cuda.HalfTensor],
133
- promote_table):
134
- for fn in getattr(tensor_overrides, list_name):
135
- promote_fn(cls, fn, handle, verbose)
136
-
137
- # 3) For any in-place version of a blacklist function, error if any input is fp16.
138
- # NB: this is overly conservative.
139
- for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
140
- wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)
141
-
142
- # 3.5) For any in-place blacklist method, error if called on fp16 tensor
143
- for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
144
- wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
145
- if compat.tensor_is_float_tensor():
146
- wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)
147
-
148
- # 4) For other in-place methods, match the type of self tensor
149
- for fn in utils.as_inplace(itertools.chain(
150
- tensor_overrides.FP16_FUNCS,
151
- tensor_overrides.CASTS)):
152
- wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
153
- if compat.tensor_is_float_tensor():
154
- wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
155
- wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)
156
-
157
- # 5) RNNs + RNN cells are whitelisted specially
158
- if rnn_compat.has_old_rnns():
159
- wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
160
- if not rnn_compat.has_old_rnns():
161
- # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
162
- torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
163
- # Wrap all the rnns
164
- for x in rnn_compat.RNN_NAMES:
165
- wrap.new_rnn_cast(x.upper(), handle, verbose)
166
-
167
- # Wrap all the RNN cells
168
- rnn_compat.whitelist_rnn_cells(handle, verbose)
169
-
170
- # 6) Place error+print message on banned functions.
171
- # Or, if allow_banned, then cast to FP32.
172
- for fn, err_msg in functional_overrides.BANNED_FUNCS:
173
- if allow_banned:
174
- wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
175
- handle, try_caching=True, verbose=verbose)
176
- else:
177
- wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
178
-
179
- _DECORATOR_HANDLE = handle
180
-
181
- _amp_state.handle = handle
182
-
183
- return handle
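Both the decorator and registry forms wired up above are small user-facing hooks; a brief sketch of how they are typically used (``torch.cumsum`` is an arbitrary example; assumes apex is installed)::

    import torch
    from apex import amp

    # Registry form: request a cast for a specific function before amp's hooks
    # are installed (the registry is consumed and cleared inside init()).
    amp.register_float_function(torch, "cumsum")

    # Decorator form: tensor arguments to this function are cast to FP16
    # (with caching) whenever amp is active.
    @amp.half_function
    def scaled_dot(q, k):
        return q @ k.t()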
 
apex/apex/amp/compat.py DELETED
@@ -1,46 +0,0 @@
1
- import torch
2
-
3
- # True for post-0.4, when Variables/Tensors merged.
4
- def variable_is_tensor():
5
- v = torch.autograd.Variable()
6
- return isinstance(v, torch.Tensor)
7
-
8
- def tensor_is_variable():
9
- x = torch.Tensor()
10
- return type(x) == torch.autograd.Variable
11
-
12
- # False for post-0.4
13
- def tensor_is_float_tensor():
14
- x = torch.Tensor()
15
- return type(x) == torch.FloatTensor
16
-
17
- # Akin to `torch.is_tensor`, but returns True for Variable
18
- # objects in pre-0.4.
19
- def is_tensor_like(x):
20
- return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
21
-
22
- # Wraps `torch.is_floating_point` if present, otherwise checks
23
- # the suffix of `x.type()`.
24
- def is_floating_point(x):
25
- if hasattr(torch, 'is_floating_point'):
26
- return torch.is_floating_point(x)
27
- try:
28
- torch_type = x.type()
29
- return torch_type.endswith('FloatTensor') or \
30
- torch_type.endswith('HalfTensor') or \
31
- torch_type.endswith('DoubleTensor')
32
- except AttributeError:
33
- return False
34
-
35
- def scalar_python_val(x):
36
- if hasattr(x, 'item'):
37
- return x.item()
38
- else:
39
- if isinstance(x, torch.autograd.Variable):
40
- return x.data[0]
41
- else:
42
- return x[0]
43
-
44
- # Accounts for the possibility that some ops may be removed from a namespace.
45
- def filter_attrs(module, attrs):
46
- return list(attrname for attrname in attrs if hasattr(module, attrname))
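These compatibility helpers are plain functions and can be exercised directly; a small sketch (assumes apex is importable)::

    import torch
    from apex.amp import compat

    x = torch.zeros(3, dtype=torch.float16)
    compat.is_floating_point(x)   # True; wraps torch.is_floating_point when available
    compat.is_tensor_like(x)      # True; also covers pre-0.4 Variable objects
    compat.filter_attrs(torch, ["cumsum", "not_a_real_op"])  # -> ["cumsum"]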
 
apex/apex/amp/frontend.py DELETED
@@ -1,446 +0,0 @@
1
- from collections import OrderedDict
2
-
3
- import torch
4
-
5
- from ._initialize import _initialize
6
- from ._amp_state import _amp_state, warn_or_err, maybe_print
7
-
8
-
9
- class Properties(object):
10
- """
11
- This class has two purposes: to establish a set of default properties,
12
- and to route setting of these attributes through __setattr__ so that (in theory)
13
- they can be checked for consistency with other existing args.
14
- """
15
- def __init__(self):
16
- self.options = {
17
- "enabled" : False,
18
- "opt_level" : None,
19
- "cast_model_type" : None,
20
- "patch_torch_functions" : False,
21
- "keep_batchnorm_fp32" : None,
22
- "master_weights" : None,
23
- "loss_scale" : 1.0,
24
- # Reserved for future functionality
25
- # "fused_optimizer" : False,
26
- # "enable_ddp_interop" : False,
27
- }
28
-
29
- """
30
- This function allows updating several options at a time without routing through
31
- __setattr__ checks, to avoid "you can't get there from here" scenarios.
32
- Currently not intended to be exposed; users are expected to select an opt_level
33
- and apply consistent modifications.
34
- """
35
- def _update_options_dict(self, new_options):
36
- for k, v in new_options:
37
- if k in self.options:
38
- self.options[k] = v
39
- else:
40
- raise ValueError("Tried to set unexpected option {}".format(k))
41
- """
42
- The members of "options" are not direct attributes of self, so access attempts
43
- will roll down to __getattr__. This borrows from the logic in torch.nn.Module.
44
- """
45
- def __getattr__(self, name):
46
- if "options" in self.__dict__:
47
- options = self.__dict__["options"]
48
- if name in options:
49
- return options[name]
50
- raise AttributeError("'{}' object has no attribute '{}'".format(
51
- type(self).__name__, name))
52
-
53
- def __setattr__(self, name, value):
54
- if "options" in self.__dict__:
55
- if name in self.options:
56
- # print("setting {} {}".format(name, value))
57
- if name == "cast_model_type":
58
- if self.opt_level == "O1" and value is not None:
59
- if value is not False:
60
- if value is not torch.float32:
61
- warn_or_err("O1 inserts casts around Torch functions rather than "
62
- "model weights, so with O1, the model weights themselves "
63
- "should remain FP32. If you wish to cast the model to a "
64
- "different type, use opt_level='O2' or 'O3'. " +
65
- "cast_model_type was {}".format(value))
66
- self.options[name] = value
67
- elif name == "patch_torch_functions":
68
- if self.opt_level != "O1" and value:
69
- warn_or_err("Currently, patch_torch_functions=True should only be set by "
70
- "selecting opt_level='O1'.")
71
- self.options[name] = value
72
- elif name == "keep_batchnorm_fp32":
73
- if self.opt_level == "O1" and value is not None:
74
- warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
75
- "to run in FP32, so keep_batchnorm_fp32 should be None." +
76
- " keep_batchnorm_fp32 was {}".format(value))
77
- if value == "False":
78
- self.options[name] = False
79
- elif value == "True":
80
- self.options[name] = True
81
- else:
82
- assert (value is True or value is False or value is None),\
83
- "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
84
- "or None, found keep_batchnorm_fp32={}".format(value)
85
- self.options[name] = value
86
- elif name == "master_weights":
87
- if self.opt_level == "O1" and value is not None:
88
- warn_or_err("It doesn't make sense to use master_weights with O1. "
89
- "With O1, your model weights themselves should be FP32.")
90
- self.options[name] = value
91
- elif name == "loss_scale":
92
- if value == "dynamic":
93
- self.options[name] = value
94
- else:
95
- self.options[name] = float(value)
96
- else:
97
- self.options[name] = value
98
- else:
99
- super(Properties, self).__setattr__(name, value)
100
-
101
-
102
- """ O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """
103
-
104
- class O3:
105
- brief = "O3: Pure FP16 training."
106
- more = "Calls .half() on your model, converting the entire model to FP16.\n"\
107
- "A casting operation is also inserted to cast incoming Tensors to FP16,\n"\
108
- "so you don't need to change your data pipeline.\n"\
109
- "This mode is useful for establishing a performance ceiling.\n"\
110
- "It's also possible training may 'just work' in this mode.\n"\
111
- "If not, try other optimization levels."
112
-
113
- def __call__(self, properties):
114
- properties.enabled = True
115
- properties.opt_level = "O3"
116
- properties.cast_model_type = torch.float16
117
- properties.patch_torch_functions = False
118
- properties.keep_batchnorm_fp32 = False
119
- properties.master_weights = False
120
- properties.loss_scale = 1.0
121
- # properties.fused_optimizer = False
122
- # properties.enable_ddp_interop = False
123
- return properties # modified in place so this isn't really necessary
124
-
125
-
126
- class O2:
127
- brief = "O2: FP16 training with FP32 batchnorm and FP32 master weights.\n"
128
- more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\
129
- "to FP16. Batchnorms are retained in FP32 for additional stability.\n"\
130
- "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\
131
- "your data pipeline.\n"\
132
- "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\
133
- "these master weights, then copy the master weights into the FP16 model weights.\n"\
134
- "Master weights can also improve convergence and stability."
135
-
136
- def __call__(self, properties):
137
- properties.enabled = True
138
- properties.opt_level = "O2"
139
- properties.cast_model_type = torch.float16
140
- properties.patch_torch_functions = False
141
- properties.keep_batchnorm_fp32 = True
142
- properties.master_weights = True
143
- properties.loss_scale = "dynamic"
144
- # properties.fused_optimizer = False
145
- # properties.enable_ddp_interop = False
146
- return properties # modified in place so this isn't really necessary
147
-
148
-
149
- class O1:
150
- brief = "O1: Insert automatic casts around Pytorch functions and Tensor methods.\n"
151
- more = "The type of your model's weights is not altered. However, internally,\n"\
152
- "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\
153
- "while operations that might benefit from the additional stability of FP32 are patched\n"\
154
- "to cast their inputs to fp32.\n"\
155
- "O1 is the safest way to try mixed precision training, and is recommended when\n"\
156
- "trying mixed precision training for the first time."
157
-
158
- def __call__(self, properties):
159
- properties.enabled = True
160
- properties.opt_level = "O1"
161
- properties.cast_model_type = None
162
- properties.patch_torch_functions = True
163
- properties.keep_batchnorm_fp32 = None
164
- properties.master_weights = None
165
- properties.loss_scale = "dynamic"
166
- # properties.fused_optimizer = False
167
- # properties.enable_ddp_interop = False
168
- return properties # modified in place so this isn't really necessary
169
-
170
-
171
- class O0:
172
- brief = "O0: Pure FP32 training.\n"
173
- more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\
174
- "types of weights and internal Pytorch operations are not altered. This mode disables any\n"\
175
- "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"
176
-
177
- def __call__(self, properties):
178
- properties.enabled = True
179
- properties.opt_level = "O0"
180
- properties.cast_model_type = torch.float32
181
- properties.patch_torch_functions = False
182
- properties.keep_batchnorm_fp32 = None
183
- properties.master_weights = False
184
- properties.loss_scale = 1.0
185
- # properties.fused_optimizer = False
186
- # properties.enable_ddp_interop = False
187
- return properties # modified in place so this isn't really necessary
188
-
189
-
190
- opt_levels = {"O3": O3(),
191
- "O2": O2(),
192
- "O1": O1(),
193
- "O0": O0()}
194
-
195
-
196
- # allow user to directly pass Properties struct as well?
197
- def initialize(
198
- models,
199
- optimizers=None,
200
- enabled=True,
201
- opt_level="O1",
202
- cast_model_type=None,
203
- patch_torch_functions=None,
204
- keep_batchnorm_fp32=None,
205
- master_weights=None,
206
- loss_scale=None,
207
- cast_model_outputs=None,
208
- num_losses=1,
209
- verbosity=1,
210
- min_loss_scale=None,
211
- max_loss_scale=2.**24
212
- ):
213
- """
214
- Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
215
- chosen ``opt_level`` and overridden properties, if any.
216
-
217
- ``amp.initialize`` should be called **after** you have finished
218
- constructing your model(s) and
219
- optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
220
- See `Distributed training`_ in the Imagenet example.
221
-
222
- Currently, ``amp.initialize`` should only be called **once**,
223
- although it can process an arbitrary number of
224
- models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
225
- If you think your use case requires ``amp.initialize`` to be called more than once,
226
- `let us know`_.
227
-
228
- Any property keyword argument that is not ``None`` will be interpreted as a manual override.
229
-
230
- To prevent having to rewrite anything else in your script, name the returned models/optimizers
231
- to replace the passed models/optimizers, as in the code sample below.
232
-
233
- Args:
234
- models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
235
- optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
236
- REQUIRED for training, optional for inference.
237
- enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
238
- should run as if Amp were not present.
239
- opt_level (str, optional, default="O1"): Pure or mixed precision optimization level. Accepted values are
240
- "O0", "O1", "O2", and "O3", explained in detail above.
241
- cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
242
- above.
243
- patch_torch_functions (bool, optional, default=None): Optional property override.
244
- keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
245
- passed as a string, must be the string "True" or "False".
246
- master_weights (bool, optional, default=None): Optional property override.
247
- loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
248
- must be a string representing a number, e.g., "128.0", or the string "dynamic".
249
- cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs
250
- of your model(s) are always cast to a particular type regardless of ``opt_level``.
251
- num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward
252
- passes you plan to use. When used in conjunction with the ``loss_id`` argument to
253
- ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
254
- which can improve stability. See "Multiple models/optimizers/losses"
255
- under `Advanced Amp Usage`_ for examples. If ``num_losses`` is left to 1, Amp will still
256
- support multiple losses/backward passes, but use a single global loss scale
257
- for all of them.
258
- verbosity (int, default=1): Set to 0 to suppress Amp-related output.
259
- min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic
260
- loss scaling. The default value of None means that no floor is imposed.
261
- If dynamic loss scaling is not used, `min_loss_scale` is ignored.
262
- max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by
263
- dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored.
264
-
265
- Returns:
266
- Model(s) and optimizer(s) modified according to the ``opt_level``.
267
- If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
268
- also be a list.
269
-
270
- Permissible invocations::
271
-
272
- model, optim = amp.initialize(model, optim,...)
273
- model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
274
- [model1, model2], optim = amp.initialize([model1, model2], optim,...)
275
- [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
276
-
277
- # This is not an exhaustive list of the cross product of options that are possible,
278
- # just a set of examples.
279
- model, optim = amp.initialize(model, optim, opt_level="O0")
280
- model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
281
-
282
- model, optim = amp.initialize(model, optim, opt_level="O1") # uses the loss_scale="dynamic" default
283
- model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
284
-
285
- model, optim = amp.initialize(model, optim, opt_level="O2") # uses the loss_scale="dynamic" default
286
- model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
287
- model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
288
-
289
- model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
290
- model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
291
- model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
292
-
293
- The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
294
-
295
- .. _`Distributed training`:
296
- https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training
297
-
298
- .. _`Imagenet example`:
299
- https://github.com/NVIDIA/apex/tree/master/examples/imagenet
300
-
301
- .. _`Advanced Amp Usage`:
302
- https://nvidia.github.io/apex/advanced.html
303
-
304
- .. _`Advanced Amp Usage topic`:
305
- https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
306
-
307
- .. _`let us know`:
308
- https://github.com/NVIDIA/apex/issues
309
- """
310
- from apex import deprecated_warning
311
- deprecated_warning("apex.amp is deprecated and will be removed by the end of February 2023. Use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)")
312
- _amp_state.opt_properties = Properties()
313
- _amp_state.verbosity = verbosity
314
-
315
- if not enabled:
316
- if optimizers is None:
317
- return models
318
- else:
319
- return models, optimizers
320
-
321
- if not torch.backends.cudnn.enabled:
322
- raise RuntimeError(
323
- "Amp requires torch.backends.cudnn.enabled = True")
324
-
325
- if opt_level not in opt_levels:
326
- raise RuntimeError(
327
- "Unexpected optimization level {}. ".format(opt_level) +
328
- "Options are 'O0', 'O1', 'O2', 'O3'. Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
329
- "not the number zero.")
330
- else:
331
- _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
332
- maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
333
- maybe_print("Defaults for this optimization level are:", True)
334
- for k, v in _amp_state.opt_properties.options.items():
335
- maybe_print("{:22} : {}".format(k, v), True)
336
-
337
- _amp_state.min_loss_scale = min_loss_scale
338
- _amp_state.max_loss_scale = max_loss_scale
339
-
340
- maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
341
- # I chose to have the keyword arguments listed directly in the argument list,
342
- # instead of **kwargs, so I can't use kwargs.items() here.
343
- if enabled is not None:
344
- _amp_state.opt_properties.enabled = enabled
345
- if opt_level is not None:
346
- _amp_state.opt_properties.opt_level = opt_level
347
- if cast_model_type is not None:
348
- _amp_state.opt_properties.cast_model_type = cast_model_type
349
- if patch_torch_functions is not None:
350
- _amp_state.opt_properties.patch_torch_functions = patch_torch_functions
351
- if keep_batchnorm_fp32 is not None:
352
- _amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32
353
- if master_weights is not None:
354
- _amp_state.opt_properties.master_weights = master_weights
355
- if loss_scale is not None:
356
- _amp_state.opt_properties.loss_scale = loss_scale
357
-
358
- maybe_print("After processing overrides, optimization options are:", True)
359
- for k, v in _amp_state.opt_properties.options.items():
360
- maybe_print("{:22} : {}".format(k, v), True)
361
-
362
- return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
363
-
364
-
365
- def state_dict(destination=None):
366
- if destination is None:
367
- destination = OrderedDict()
368
-
369
- for idx, loss_scaler in enumerate(_amp_state.loss_scalers):
370
- destination['loss_scaler%d' % idx] = {
371
- 'loss_scale': loss_scaler.loss_scale(),
372
- 'unskipped': loss_scaler._unskipped,
373
- }
374
- return destination
375
-
376
-
377
- def load_state_dict(state_dict):
378
- # Check if state_dict contains the same number of loss_scalers as the current setup
379
- if len(state_dict) != len(_amp_state.loss_scalers):
380
- print('Warning: state_dict contains {} entries, while {} loss_scalers are used'.format(
381
- len(state_dict), len(_amp_state.loss_scalers)))
382
-
383
- state_dict = state_dict.copy()
384
-
385
- nb_loss_scalers = len(_amp_state.loss_scalers)
386
- unexpected_keys = []
387
- # Initialize idx outside, since unexpected_keys will increase it if enumerate is used
388
- idx = 0
389
- for key in state_dict:
390
- if 'loss_scaler' not in key:
391
- unexpected_keys.append(key)
392
- else:
393
- if idx > (nb_loss_scalers - 1):
394
- print('Skipping loss_scaler[{}], since num_losses was set to {}'.format(
395
- idx, nb_loss_scalers))
396
- break
397
- _amp_state.loss_scalers[idx]._loss_scale = state_dict[key]['loss_scale']
398
- _amp_state.loss_scalers[idx]._unskipped = state_dict[key]['unskipped']
399
- idx += 1
400
-
401
- if len(unexpected_keys) > 0:
402
- raise RuntimeError(
403
- 'Error(s) in loading state_dict. Unexpected key(s) in state_dict: {}. '.format(
404
- ', '.join('"{}"'.format(k) for k in unexpected_keys)))
405
-
406
-
407
- # TODO: is this necessary/useful?
408
- # def check_option_consistency(enabled=True,
409
- # opt_level=None,
410
- # cast_model_type=None,
411
- # patch_torch_functions=None,
412
- # keep_batchnorm_fp32=None,
413
- # master_weights=None,
414
- # loss_scale=None,
415
- # enable_ddp_interop=None,
416
- # hard_override=False):
417
- # """
418
- # Utility function that enables users to quickly check if the option combination they intend
419
- # to use is permitted. ``check_option_consistency`` does not require models or optimizers
420
- # to be constructed, and can be called at any point in the script. ``check_option_consistency``
421
- # is totally self-contained; it does not set any amp global state or affect anything outside
422
- # of itself.
423
- # """
424
- #
425
- # if not enabled:
426
- # return
427
- #
428
- # if opt_level not in opt_levels:
429
- # raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.")
430
- # else:
431
- # opt_properties = opt_levels[opt_level](Properties())
432
- # print("Selected optimization level {}", opt_levels[opt_level].brief)
433
- # print("Defaults for this optimization level are:")
434
- # for k, v in opt_properties.options:
435
- # print("{:22} : {}".format(k, v))
436
- #
437
- # print("Processing user overrides (additional kwargs that are not None)...")
438
- # for k, v in kwargs:
439
- # if k not in _amp_state.opt_properties.options:
440
- # raise RuntimeError("Unexpected kwarg {}".format(k))
441
- # if v is not None:
442
- # setattr(opt_properties, k, v)
443
- #
444
- # print("After processing overrides, optimization options are:")
445
- # for k, v in opt_properties.options:
446
- # print("{:22} : {}".format(k, v))
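The ``state_dict``/``load_state_dict`` pair above lets the loss scaler(s) be checkpointed alongside model and optimizer state. A minimal sketch, assuming the same ``num_losses`` is used on save and restore (model/optimizer construction shown only to keep it self-contained)::

    import torch
    from apex import amp

    model = torch.nn.Linear(8, 8).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # Save
    checkpoint = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "amp": amp.state_dict(),
    }
    torch.save(checkpoint, "checkpoint.pt")

    # Restore (in a fresh run, after amp.initialize has been called again)
    checkpoint = torch.load("checkpoint.pt")
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    amp.load_state_dict(checkpoint["amp"])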
 
apex/apex/amp/handle.py DELETED
@@ -1,281 +0,0 @@
1
- import contextlib
2
- import warnings
3
- import sys
4
- import torch
5
-
6
- from . import utils
7
- from .opt import OptimWrapper
8
- from .scaler import LossScaler
9
- from ._amp_state import _amp_state, master_params, maybe_print
10
-
11
- if torch.distributed.is_available():
12
- from ..parallel.LARC import LARC
13
-
14
-
15
- # There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
16
- @contextlib.contextmanager
17
- def scale_loss(loss,
18
- optimizers,
19
- loss_id=0,
20
- model=None,
21
- delay_unscale=False,
22
- delay_overflow_check=False):
23
- """
24
- On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
25
- ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
26
-
27
- with amp.scale_loss(loss, optimizer) as scaled_loss:
28
- scaled_loss.backward()
29
-
30
- On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
31
- and unscaled, so that ``optimizer.step()`` can be called.
32
-
33
- .. note::
34
- If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
35
- can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
36
- any FP16 gradients are copied to FP32 master gradients before being unscaled.
37
- ``optimizer.step()`` will then apply the unscaled master gradients to the master params.
38
-
39
- .. warning::
40
- If Amp is using explicit FP32 master params, only the FP32 master gradients will be
41
- unscaled. The direct ``.grad`` attributes of any FP16
42
- model params will remain scaled after context manager exit.
43
- This subtlety affects gradient clipping. See "Gradient clipping" under
44
- `Advanced Amp Usage`_ for best practices.
45
-
46
- Args:
47
- loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
48
- manager yields is simply ``loss.float()*loss_scale``, so in principle
49
- ``loss`` could have more than one element, as long as you call
50
- ``backward()`` on ``scaled_loss`` appropriately within the context manager body.
51
- optimizers: All optimizer(s) for which the current backward pass is creating gradients.
52
- Must be an optimizer or list of optimizers returned from an earlier call
53
- to ``amp.initialize``. For example use with multiple optimizers, see
54
- "Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
55
- loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument
56
- to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id``
57
- must be an integer between 0 and ``num_losses`` that tells Amp which loss is
58
- being used for the current backward pass. See "Multiple models/optimizers/losses"
59
- under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp
60
- will use the default global loss scaler for this backward pass.
61
- model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
62
- optimizations.
63
- delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary, and
64
- the default value of ``False`` is strongly recommended.
65
- If ``True``, Amp will not unscale the gradients or perform model->master
66
- gradient copies on context manager exit.
67
- ``delay_unscale=True`` is a minor ninja performance optimization and can result
68
- in weird gotchas (especially with multiple models/optimizers/losses),
69
- so only use it if you know what you're doing.
70
- "Gradient accumulation across iterations" under `Advanced Amp Usage`_
71
- illustrates a situation where this CAN (but does not need to) be used.
72
-
73
- .. warning::
74
- If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
75
- called yet after context manager exit, and must wait for another, later backward context
76
- manager invocation with ``delay_unscale`` left to False.
77
-
78
- .. _`Advanced Amp Usage`:
79
- https://nvidia.github.io/apex/advanced.html
80
- """
81
- if not hasattr(_amp_state, "opt_properties"):
82
- raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. "
83
- "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
84
- "before `with amp.scale_loss`.")
85
-
86
- if not _amp_state.opt_properties.enabled:
87
- yield loss
88
- return
89
-
90
- if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
91
- optimizers = [optimizers]
92
-
93
- loss_scaler = _amp_state.loss_scalers[loss_id]
94
- loss_scale = loss_scaler.loss_scale()
95
-
96
- if ((not _amp_state.opt_properties.master_weights)
97
- and (not loss_scaler.dynamic)
98
- and loss_scale == 1.0):
99
- yield loss.float()
100
- # Needing to drop the cache here as well is an ugly gotcha.
101
- # But for now I think it's necessary to short-circuit.
102
- # Probably ok to skip this if not delay_unscale
103
- if _amp_state.opt_properties.patch_torch_functions:
104
- _amp_state.handle._clear_cache()
105
- return
106
-
107
- if not delay_unscale:
108
- if isinstance(optimizers, list):
109
- for optimizer in optimizers:
110
- if not optimizer._amp_stash.params_have_scaled_gradients:
111
- optimizer._prepare_amp_backward()
112
-
113
- yield (loss.float())*loss_scale
114
-
115
- if delay_unscale:
116
- for optimizer in optimizers:
117
- optimizer._amp_stash.params_have_scaled_gradients = True
118
- else:
119
- # FusedSGD may take care of unscaling as part of their step() methods.
120
- # if not isinstance(optimizers, FP16_Optimizer_for_fused):
121
- loss_scaler.clear_overflow_state()
122
- for optimizer in optimizers:
123
- optimizer._post_amp_backward(loss_scaler)
124
- optimizer._amp_stash.params_have_scaled_gradients = False
125
- # For future fused optimizers that enable sync-free dynamic loss scaling,
126
- # should_skip will always be False.
127
- should_skip = False if delay_overflow_check else loss_scaler.update_scale()
128
- if should_skip:
129
- for optimizer in optimizers:
130
- if not optimizer._amp_stash.already_patched:
131
- # Close on loss_scaler and loss_id as well, to be safe. Probably not
132
- # necessary because amp.scale_loss is already creating a temporary scope.
133
- def patch_step(opt, loss_scaler, loss_id):
134
- opt_step = opt.step
135
- def skip_step(closure=None):
136
- if closure is not None:
137
- raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
138
- maybe_print(("Gradient overflow. Skipping step, loss scaler " +
139
- "{} reducing loss scale to {}").format(loss_id,
140
- loss_scaler.loss_scale()))
141
- # TODO: I don't like the special casing for different optimizer implementations.
142
- # Maybe skip should delegate to a method owned by the optimizers themselves.
143
- if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
144
- # Clear the master grads that wouldn't be zeroed by model.zero_grad()
145
- for param in opt._amp_stash.all_fp32_from_fp16_params:
146
- param.grad = None
147
- if hasattr(opt, "most_recent_scale"):
148
- opt.most_recent_scale = 1.0
149
- opt.scale_set_by_backward = False
150
- opt.step = opt_step
151
- opt._amp_stash.already_patched = False
152
- return skip_step
153
- optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
154
- optimizer._amp_stash.already_patched = True
155
-
156
- # Probably ok to skip this if not delay_unscale
157
- if _amp_state.opt_properties.patch_torch_functions:
158
- _amp_state.handle._clear_cache()
159
-
160
-
161
- # Free function version of AmpHandle.disable_casts, another step on the
162
- # path to removing the concept of "AmpHandle"
163
- @contextlib.contextmanager
164
- def disable_casts():
165
- _amp_state.handle._is_active = False
166
- yield
167
- _amp_state.handle._is_active = True
168
-
169
-
170
- class AmpHandle(object):
171
- def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
172
- self._enable_caching = enable_caching
173
- self._verbose = verbose
174
- self._cache = dict()
175
- self._default_scaler = LossScaler(loss_scale)
176
- self._is_active = True
177
- self._all_wrappers = []
178
-
179
- def is_active(self):
180
- return self._is_active
181
-
182
- @contextlib.contextmanager
183
- def _disable_casts(self):
184
- self._is_active = False
185
- yield
186
- self._is_active = True
187
-
188
- def wrap_optimizer(self, optimizer, num_loss=1):
189
- self._default_scaler = None
190
- return OptimWrapper(optimizer, self, num_loss)
191
-
192
- @contextlib.contextmanager
193
- def scale_loss(self, loss, optimizer):
194
- raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
195
- "documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
196
- "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
197
-
198
- if not self.is_active():
199
- yield loss
200
- return
201
-
202
- if self._default_scaler is None:
203
- raise RuntimeError(
204
- 'After calling `handle.wrap_optimizer()`, you must explicitly ' +
205
- 'use `optimizer.scale_loss(loss)`.')
206
-
207
- # TODO: this code block is duplicated here and `opt.py`. Unify.
208
- loss_scale = self._default_scaler.loss_scale()
209
- yield loss * loss_scale
210
-
211
- self._default_scaler.clear_overflow_state()
212
- self._default_scaler.unscale(
213
- master_params(optimizer),
214
- master_params(optimizer),
215
- loss_scale)
216
- should_skip = self._default_scaler.update_scale()
217
- if should_skip:
218
- optimizer_step = optimizer.step
219
- def skip_step():
220
- maybe_print('Gradient overflow, skipping update')
221
- optimizer.step = optimizer_step
222
- optimizer.step = skip_step
223
-
224
- self._clear_cache()
225
-
226
- def _clear_cache(self):
227
- self._cache.clear()
228
-
229
- # Experimental support for saving / restoring uncasted versions of functions
230
- def _save_func(self, mod, fn, func):
231
- self._all_wrappers.append((mod, fn, func))
232
-
233
- def _deactivate(self):
234
- for mod, fn, func in self._all_wrappers:
235
- utils.set_func(mod, fn, func)
236
- self._all_wrappers = []
237
-
238
- @property
239
- def has_cache(self):
240
- return self._enable_caching
241
-
242
- @property
243
- def cache(self):
244
- return self._cache
245
-
246
- def remove_cache(self, param):
247
- if self.has_cache and param in self.cache:
248
- del self.cache[param]
249
-
250
- @property
251
- def verbose(self):
252
- return self._verbose
253
-
254
- class NoOpHandle(object):
255
- def is_active(self):
256
- return False
257
-
258
- @contextlib.contextmanager
259
- def _disable_casts(self):
260
- yield
261
-
262
- def wrap_optimizer(self, optimizer, num_loss=1):
263
- return OptimWrapper(optimizer, self, num_loss)
264
-
265
- @contextlib.contextmanager
266
- def scale_loss(self, loss, optimizer):
267
- yield loss
268
-
269
- @property
270
- def has_cache(self):
271
- return False
272
-
273
- @property
274
- def verbose(self):
275
- return False
276
-
277
- def _clear_cache(self):
278
- pass
279
-
280
- def _deactivate(self):
281
- pass
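A minimal usage sketch of the `amp.scale_loss` API documented above, assuming apex is installed and a CUDA device is available; the model, optimizer, and tensor shapes are placeholders.

import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(64, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Initialize Amp once, before the training loop (see the error message above).
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(10):
    inputs = torch.randn(32, 64, device="cuda")
    targets = torch.randn(32, 8, device="cuda")
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(inputs), targets)
    # scale_loss yields the loss multiplied by the current loss scale; gradients
    # are unscaled (and checked for overflow) when the context manager exits.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

If an overflow is detected, the code above patches `optimizer.step` into a one-shot no-op, so the skipped iteration simply reduces the loss scale and training continues.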
 
apex/apex/amp/lists/__init__.py DELETED
File without changes
apex/apex/amp/lists/functional_overrides.py DELETED
@@ -1,80 +0,0 @@
1
-
2
- # TODO: think about the following two. They do weird things.
3
- # - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
4
- # - torch.nn.utils.weight_norm
5
-
6
- # Notes:
7
- # F.instance_norm uses batch_norm internally. Which correctly handles
8
- # fp16 in/out with fp32 weights. So we shouldn't do anything for
9
- # either of these.
10
- # F.normalize calls `input.norm()` internally, so it's redundant, but
11
- # kept here in case impl. changes.
12
- # F.cosine_similarity is same: calls `x.norm()` internally.
13
-
14
- import torch.nn.functional
15
-
16
- MODULE = torch.nn.functional
17
-
18
- FP16_FUNCS = [
19
- 'conv1d',
20
- 'conv2d',
21
- 'conv3d',
22
- 'conv_transpose1d',
23
- 'conv_transpose2d',
24
- 'conv_transpose3d',
25
- 'conv_tbc', # Undocumented / maybe new?
26
- 'linear',
27
- ]
28
-
29
- FP32_FUNCS = [
30
-
31
- # Interpolation/Upsampling TODO: Remove for 1.2
32
- 'interpolate',
33
- 'grid_sample',
34
-
35
- # Pointwise
36
- 'softplus',
37
- 'softmin',
38
- 'log_softmax',
39
- 'softmax',
40
- 'gelu',
41
-
42
- # Normalization
43
- 'layer_norm',
44
- 'group_norm',
45
- 'local_response_norm',
46
- 'normalize',
47
- 'cosine_similarity',
48
-
49
- # Loss functions
50
- # TODO: which of these can be fp16?
51
- 'poisson_nll_loss',
52
- 'cosine_embedding_loss',
53
- 'cross_entropy',
54
- 'hinge_embedding_loss',
55
- 'kl_div',
56
- 'l1_loss',
57
- 'mse_loss',
58
- 'margin_ranking_loss',
59
- 'multilabel_margin_loss',
60
- 'multilabel_soft_margin_loss',
61
- 'multi_margin_loss',
62
- 'nll_loss',
63
- 'binary_cross_entropy_with_logits',
64
- 'smooth_l1_loss',
65
- 'soft_margin_loss',
66
- 'triplet_margin_loss',
67
- 'ctc_loss'
68
- ]
69
-
70
- BANNED_FUNCS = [
71
- ('binary_cross_entropy',
72
- ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
73
- "It requires that the output of the previous function be already a FloatTensor. \n\n"
74
- "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
75
- " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
76
- "that is compatible with amp.\nAnother option is to add\n"
77
- " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
78
- "If you _really_ know what you are doing, you can disable this warning by passing "
79
- "allow_banned=True to `amp.init()`."))
80
- ]
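As a concrete illustration of the `binary_cross_entropy` entry in BANNED_FUNCS above, a small hedged sketch of the recommended workaround: compute the loss from raw logits with BCEWithLogitsLoss instead of Sigmoid followed by BCELoss. The shapes are arbitrary placeholders.

import torch
import torch.nn as nn

logits = torch.randn(16, 1)                      # raw outputs, no Sigmoid applied
targets = torch.randint(0, 2, (16, 1)).float()

# Fuses Sigmoid + BCELoss into one numerically safe op, as the message suggests.
criterion = nn.BCEWithLogitsLoss()
loss = criterion(logits, targets)
print(loss.item())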
 
apex/apex/amp/lists/tensor_overrides.py DELETED
@@ -1,63 +0,0 @@
1
- from .. import compat
2
- from . import torch_overrides
3
-
4
- import importlib
5
-
6
- import torch
7
-
8
- # if compat.variable_is_tensor() and not compat.tensor_is_variable():
9
- MODULE = torch.Tensor
10
- # else:
11
- # MODULE = torch.autograd.Variable
12
-
13
-
14
- FP16_FUNCS = compat.filter_attrs(MODULE, [
15
- '__matmul__',
16
- ])
17
-
18
- FP32_FUNCS = compat.filter_attrs(MODULE, [
19
- '__ipow__',
20
- '__pow__',
21
- '__rpow__',
22
-
23
- # Cast to fp32 before transfer to CPU
24
- 'cpu',
25
- ])
26
-
27
- CASTS = compat.filter_attrs(MODULE, [
28
- '__add__',
29
- '__div__',
30
- '__eq__',
31
- '__ge__',
32
- '__gt__',
33
- '__iadd__',
34
- '__idiv__',
35
- '__imul__',
36
- '__isub__',
37
- '__itruediv__',
38
- '__le__',
39
- '__lt__',
40
- '__mul__',
41
- '__ne__',
42
- '__radd__',
43
- '__rdiv__',
44
- '__rmul__',
45
- '__rsub__',
46
- '__rtruediv__',
47
- '__sub__',
48
- '__truediv__',
49
- ])
50
-
51
- # None of these, but here to make code cleaner.
52
- SEQUENCE_CASTS = []
53
-
54
- # We need to grab all the methods from torch_overrides and add them to
55
- # the Tensor lists as well, as almost all methods are duplicated
56
- # between `torch` and `torch.Tensor` (and check with `hasattr`,
57
- # because a few random ones aren't defined on Tensor)
58
- _self_mod = importlib.import_module(__name__)
59
- for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
60
- lst = getattr(_self_mod, attrname)
61
- for fn in getattr(torch_overrides, attrname):
62
- if hasattr(MODULE, fn):
63
- lst.append(fn)
 
apex/apex/amp/lists/torch_overrides.py DELETED
@@ -1,115 +0,0 @@
1
- import torch
2
-
3
- from .. import utils
4
-
5
- MODULE = torch
6
-
7
- FP16_FUNCS = [
8
- # Low level functions wrapped by torch.nn layers.
9
- # The wrapper layers contain the weights which are then passed in as a parameter
10
- # to these functions.
11
- 'conv1d',
12
- 'conv2d',
13
- 'conv3d',
14
- 'conv_transpose1d',
15
- 'conv_transpose2d',
16
- 'conv_transpose3d',
17
- 'conv_tbc',
18
- 'prelu',
19
-
20
- # BLAS
21
- 'addmm',
22
- 'addmv',
23
- 'addr',
24
- 'matmul',
25
- 'mm',
26
- 'mv',
27
- ]
28
-
29
- FP32_FUNCS = [
30
- # Pointwise
31
- 'acos',
32
- 'asin',
33
- 'cosh',
34
- 'erfinv',
35
- 'exp',
36
- 'expm1',
37
- 'log',
38
- 'log10',
39
- 'log2',
40
- 'reciprocal',
41
- 'rsqrt',
42
- 'sinh',
43
- 'tan',
44
-
45
- # Other math
46
- 'pow',
47
-
48
- # Reduction
49
- 'cumprod',
50
- 'cumsum',
51
- 'dist',
52
- # 'mean',
53
- 'norm',
54
- 'prod',
55
- 'std',
56
- 'sum',
57
- 'var',
58
-
59
- # Misc
60
- 'renorm'
61
- ]
62
-
63
- version_strings = torch.__version__.split('.')
64
- version_major = version_strings[0]
65
- version_minor = version_strings[1]
66
- version_num = float(version_major + "." + version_minor)
67
- # Before torch 1.1, mean must be blacklisted.
68
- if version_num < 1.1:
69
- FP32_FUNCS.append('mean')
70
-
71
- # Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
72
- # check the CUDA version -- if at least 9.1, then put the bmm
73
- # functions on the fp16 list. Otherwise, put them on the fp32 list.
74
- _bmms = ['addbmm',
75
- 'baddbmm',
76
- 'bmm']
77
-
78
- if utils.is_cuda_enabled():
79
- # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
80
- if utils.get_cuda_version() >= (9, 1, 0):
81
- FP16_FUNCS.extend(_bmms)
82
- else:
83
- FP32_FUNCS.extend(_bmms)
84
-
85
- # Multi-tensor fns that may need type promotion
86
- CASTS = [
87
- # Multi-tensor math
88
- 'addcdiv',
89
- 'addcmul',
90
- 'atan2',
91
- 'cross',
92
- 'bilinear',
93
- 'dot',
94
-
95
- # Element-wise _or_ tensor-wise math
96
- 'add',
97
- 'div',
98
- 'mul',
99
-
100
- # Comparison
101
- 'eq',
102
- 'equal',
103
- 'ge',
104
- 'gt',
105
- 'le',
106
- 'lt',
107
- 'ne'
108
- ]
109
-
110
- # Functions that take sequence arguments. We need to inspect the whole
111
- # sequence and cast to the widest type.
112
- SEQUENCE_CASTS = [
113
- 'cat',
114
- 'stack'
115
- ]
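The float-based version check above works for the versions it targets, but parsing into an integer tuple is the more robust pattern (it avoids pitfalls such as "1.10" collapsing to the float 1.1). A small alternative sketch, illustrative only:

import torch

# Compare versions as integer tuples, e.g. (1, 9) < (1, 10).
version_tuple = tuple(int(x) for x in torch.__version__.split("+")[0].split(".")[:2])
if version_tuple < (1, 1):
    print("torch.mean must run in FP32 on this torch version")

Note that the CUDA check above already compares tuples (`utils.get_cuda_version()` returns one), so both checks could share that style.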
 
apex/apex/amp/opt.py DELETED
@@ -1,103 +0,0 @@
1
- import contextlib
2
- import warnings
3
-
4
- from .scaler import LossScaler, master_params
5
- from ._amp_state import maybe_print
6
-
7
- import numpy as np
8
-
9
- class OptimWrapper(object):
10
- def __init__(self, optimizer, amp_handle, num_loss):
11
- self._optimizer = optimizer
12
- self._amp_handle = amp_handle
13
- self._num_loss = num_loss
14
- self._loss_idx = 0
15
- self._skip_next = [False] * num_loss
16
- self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]
17
-
18
- @contextlib.contextmanager
19
- def scale_loss(self, loss):
20
- if not self._amp_handle.is_active():
21
- yield loss
22
- return
23
-
24
- # When there are multiple losses per-optimizer, we need
25
- # to save out the current grad accumulation, since we won't be
26
- # able to unscale this particular loss once the grads are
27
- # all mixed together.
28
- cached_grads = []
29
- if self._loss_idx > 0:
30
- for p in master_params(self._optimizer):
31
- if p.grad is not None:
32
- cached_grads.append(p.grad.data.detach().clone())
33
- else:
34
- cached_grads.append(None)
35
- self._optimizer.zero_grad()
36
-
37
- loss_scale = self._cur_loss_scaler().loss_scale()
38
- yield loss * loss_scale
39
-
40
- self._cur_loss_scaler().clear_overflow_state()
41
- self._cur_loss_scaler().unscale(
42
- master_params(self._optimizer),
43
- master_params(self._optimizer),
44
- loss_scale)
45
- self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
46
- self._loss_idx += 1
47
-
48
- if len(cached_grads) > 0:
49
- for p, cached_grad in zip(master_params(self._optimizer),
50
- cached_grads):
51
- if cached_grad is not None:
52
- p.grad.data.add_(cached_grad)
53
- cached_grads = []
54
-
55
- def _cur_loss_scaler(self):
56
- assert 0 <= self._loss_idx < self._num_loss
57
- return self._loss_scaler[self._loss_idx]
58
-
59
- def step(self, closure=None):
60
- if not self._amp_handle.is_active():
61
- return self._optimizer.step(closure=closure)
62
-
63
- self._loss_idx = 0
64
-
65
- for group in self._optimizer.param_groups:
66
- for p in group['params']:
67
- self._amp_handle.remove_cache(p)
68
-
69
- if closure is not None:
70
- raise NotImplementedError(
71
- 'The `closure` argument is unsupported by the amp ' +
72
- 'optimizer wrapper.')
73
- if any(self._skip_next):
74
- maybe_print('Gradient overflow, skipping update')
75
- self._skip_next = [False] * self._num_loss
76
- else:
77
- return self._optimizer.step(closure=closure)
78
-
79
- # Forward any attribute lookups
80
- def __getattr__(self, attr):
81
- return getattr(self._optimizer, attr)
82
-
83
- # Forward all torch.optim.Optimizer methods
84
- def __getstate__(self):
85
- return self._optimizer.__getstate__()
86
-
87
- def __setstate__(self):
88
- return self._optimizer.__setstate__()
89
-
90
- def __repr__(self):
91
- return self._optimizer.__repr__()
92
-
93
- def state_dict(self):
94
- return self._optimizer.state_dict()
95
-
96
- def load_state_dict(self, state_dict):
97
- return self._optimizer.load_state_dict(state_dict)
98
-
99
- def zero_grad(self):
100
- return self._optimizer.zero_grad()
101
-
102
- def add_param_group(self, param_group):
103
- return self._optimizer.add_param_group(param_group)
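For reference, a hedged sketch of how the legacy handle-based flow above was driven (this is the deprecated path that handle.py tells users to migrate away from); `amp.init()` returning a handle is assumed, and the model and losses are placeholders.

import torch
import torch.nn as nn
from apex import amp

model = nn.Linear(64, 8).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

amp_handle = amp.init()                                    # legacy handle-based API
optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=2)

inputs = torch.randn(32, 64, device="cuda")
loss_a = model(inputs).pow(2).mean()
loss_b = model(inputs).abs().mean()

# One LossScaler per loss; OptimWrapper caches and re-adds grads between losses.
with optimizer.scale_loss(loss_a) as scaled_a:
    scaled_a.backward()
with optimizer.scale_loss(loss_b) as scaled_b:
    scaled_b.backward()
optimizer.step()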
 
apex/apex/amp/rnn_compat.py DELETED
@@ -1,53 +0,0 @@
1
- from . import utils, wrap
2
-
3
- import torch
4
- _VF = torch._C._VariableFunctions
5
- RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
6
-
7
- def _gen_VF_wrapper(name):
8
- def wrapper(*args, **kwargs):
9
- return getattr(_VF, name)(*args, **kwargs)
10
- return wrapper
11
-
12
- # Some python magic to generate an object that has the rnn cell functions
13
- # defined on it, all of which call into corresponding _VF version.
14
- # Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
15
- # imported at module scope within torch.nn.modules.rnn). This should
16
- # not affect third-party importers of _VF.py.
17
- class VariableFunctionsShim(object):
18
- def __init__(self):
19
- for name in RNN_NAMES:
20
- for suffix in ['', '_cell']:
21
- fn_name = name + suffix
22
- setattr(self, fn_name, _gen_VF_wrapper(fn_name))
23
-
24
- def has_old_rnns():
25
- try:
26
- torch.nn.backends.thnn.backend.LSTMCell
27
- return True
28
- except:
29
- return False
30
-
31
- def whitelist_rnn_cells(handle, verbose):
32
- # Different module + function names in old/new RNN cases
33
- if has_old_rnns():
34
- fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
35
- mod = torch.nn.backends.thnn.backend
36
- else:
37
- fn_names = [x + '_cell' for x in RNN_NAMES]
38
- mod = torch.nn.modules.rnn._VF
39
- assert isinstance(mod, VariableFunctionsShim)
40
-
41
- # Insert casts on cell functions
42
- for fn in fn_names:
43
- wrap.cached_cast(mod, fn, utils.maybe_half, handle,
44
- try_caching=True, verbose=verbose)
45
-
46
- if has_old_rnns():
47
- # Special handling of `backward` for fused gru / lstm:
48
- # The `backward` method calls Tensor.sum() (blacklist) internally,
49
- # and then the resulting grad_input has the wrong type.
50
- # TODO: where else is this a problem?
51
- for rnn_type in ['GRUFused', 'LSTMFused']:
52
- mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
53
- wrap.disable_casts(mod, 'backward', handle)
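A standalone sketch of why `_gen_VF_wrapper` above is written as a separate factory: binding `name` through a function argument gives each wrapper its own copy, avoiding Python's late-binding-in-loops pitfall. The toy namespace is illustrative only.

class FakeVF:
    @staticmethod
    def lstm():
        return "lstm result"

    @staticmethod
    def gru():
        return "gru result"

def gen_wrapper(name):
    # `name` is captured per call, so each wrapper targets its own function.
    def wrapper(*args, **kwargs):
        return getattr(FakeVF, name)(*args, **kwargs)
    return wrapper

wrappers = {n: gen_wrapper(n) for n in ["lstm", "gru"]}
assert wrappers["lstm"]() == "lstm result"
assert wrappers["gru"]() == "gru result"

# A naive closure created directly in the loop would late-bind `name` instead:
late_bound = {}
for name in ["lstm", "gru"]:
    late_bound[name] = lambda: getattr(FakeVF, name)()
assert late_bound["lstm"]() == "gru result"    # both closures see name == "gru"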
 
apex/apex/amp/scaler.py DELETED
@@ -1,217 +0,0 @@
1
- import torch
2
- from ..multi_tensor_apply import multi_tensor_applier
3
- from ._amp_state import _amp_state, master_params, maybe_print
4
- from itertools import product
5
-
6
- def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
7
- # Exception handling for 18.04 compatibility
8
- if check_overflow:
9
- cpu_sum = float(model_grad.float().sum())
10
- if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
11
- return True
12
-
13
- if master_grad is not model_grad: # copy_ probably internally short-circuits this
14
- master_grad.copy_(model_grad)
15
- if scale != 1.0:
16
- master_grad.mul_(scale)
17
- return False
18
-
19
- def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
20
- # Exception handling for 18.04 compatibility
21
- if check_overflow:
22
- cpu_sum = float(model_grad.float().sum())
23
- if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
24
- return True
25
-
26
- # if master_grad is not model_grad: # copy_ probably internally short-circuits this
27
- # master_grad.copy_(model_grad)
28
- assert stashed_grad.dtype == master_grad.dtype
29
- converted_model_grad = model_grad.data.to(master_grad.dtype)
30
- master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
31
- return False
32
-
33
- class LossScaler(object):
34
- warned_no_fused_kernel = False
35
- warned_unscaling_non_fp32_grad = False
36
- has_fused_kernel = False
37
-
38
- def __init__(self,
39
- loss_scale,
40
- init_scale=2.**16,
41
- scale_factor=2.,
42
- scale_window=2000,
43
- min_loss_scale=None,
44
- max_loss_scale=2.**24):
45
- if loss_scale == "dynamic":
46
- self.dynamic = True
47
- self._loss_scale = min(max_loss_scale, init_scale)
48
- else:
49
- self.dynamic = False
50
- self._loss_scale = loss_scale
51
- self._max_loss_scale = max_loss_scale
52
- self._min_loss_scale = min_loss_scale
53
- self._scale_seq_len = scale_window
54
- self._unskipped = 0
55
- self._has_overflow = False
56
- self._overflow_buf = torch.cuda.IntTensor([0])
57
- if multi_tensor_applier.available:
58
- import amp_C
59
- LossScaler.has_fused_kernel = multi_tensor_applier.available
60
- LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
61
- LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
62
- else:
63
- if not LossScaler.warned_no_fused_kernel:
64
- maybe_print(
65
- "Warning: multi_tensor_applier fused unscale kernel is unavailable, "
66
- "possibly because apex was installed without --cuda_ext --cpp_ext. "
67
- "Using Python fallback. Original ImportError was: " +
68
- repr(multi_tensor_applier.import_err),
69
- True)
70
- LossScaler.has_fused_kernel = False
71
- LossScaler.warned_no_fused_kernel = True
72
-
73
- def loss_scale(self):
74
- return self._loss_scale
75
-
76
- def unscale_python(self, model_grads, master_grads, scale):
77
- for model, master in zip(model_grads, master_grads):
78
- if model is not None:
79
- if not LossScaler.warned_unscaling_non_fp32_grad:
80
- if master.dtype != torch.float32:
81
- maybe_print(
82
- "Attempting to unscale a grad with type {} ".format(master.type()) +
83
- "Unscaling non-fp32 grads may indicate an error. "
84
- "When using Amp, you don't need to call .half() on your model.")
85
- LossScaler.warned_unscaling_non_fp32_grad = True
86
- self._has_overflow = scale_check_overflow_python(model,
87
- master,
88
- 1./scale,
89
- self.dynamic)
90
- if self._has_overflow and self.dynamic:
91
- break
92
-
93
- # unused_scale keeps some of the old API alive for hopefully a short time.
94
- def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
95
- if self._has_overflow:
96
- return
97
-
98
- scale = self._loss_scale
99
- if scale_override is not None:
100
- scale = scale_override
101
-
102
- if scale == 1.0 and models_are_masters and not self.dynamic:
103
- return
104
-
105
- if LossScaler.has_fused_kernel:
106
- # if (not LossScaler.warned_unscaling_non_fp32_grad
107
- # and master_grads[0].dtype == torch.float16):
108
- # print("Warning: unscaling grads that are not FP32. "
109
- # "Unscaling non-fp32 grads may indicate an error. "
110
- # "When using Amp, you don't need to call .half() on your model.")
111
- # # Setting this to True unconditionally allows the possibility of an escape
112
- # # if never-before-seen non-fp32 grads are created in some later iteration.
113
- # LossScaler.warned_unscaling_non_fp32_grad = True
114
- multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
115
- self._overflow_buf,
116
- [model_grads, master_grads],
117
- 1./scale)
118
- else:
119
- self.unscale_python(model_grads, master_grads, scale)
120
-
121
- # Defer to update_scale
122
- # If the fused kernel is available, we only need one D2H memcopy and sync.
123
- # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
124
- # self._has_overflow = self._overflow_buf.item()
125
-
126
- def unscale_with_stashed_python(self,
127
- model_grads,
128
- stashed_master_grads,
129
- master_grads,
130
- a,
131
- b):
132
- for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
133
- if model is None and stashed is None:
134
- continue
135
- else:
136
- if not LossScaler.warned_unscaling_non_fp32_grad:
137
- if master.dtype != torch.float32:
138
- maybe_print(
139
- "Attempting to unscale a grad with type {} ".format(master.type()) +
140
- "Unscaling non-fp32 grads may indicate an error. "
141
- "When using Amp, you don't need to call .half() on your model.")
142
- LossScaler.warned_unscaling_non_fp32_grad = True
143
- self._has_overflow = axpby_check_overflow_python(model,
144
- stashed,
145
- master,
146
- a,
147
- b,
148
- self.dynamic)
149
- if self._has_overflow and self.dynamic:
150
- break
151
-
152
- def unscale_with_stashed(self,
153
- model_grads,
154
- stashed_master_grads,
155
- master_grads,
156
- scale_override=None):
157
- if self._has_overflow:
158
- return
159
-
160
- grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
161
- if scale_override is not None:
162
- grads_have_scale, stashed_have_scale, out_scale = scale_override
163
-
164
- if LossScaler.has_fused_kernel:
165
- if (not LossScaler.warned_unscaling_non_fp32_grad
166
- and master_grads[0].dtype == torch.float16):
167
- print("Warning: unscaling grads that are not FP32. "
168
- "Unscaling non-fp32 grads may indicate an error. "
169
- "When using Amp, you don't need to call .half() on your model.")
170
- # Setting this to True unconditionally allows the possibility of an escape
171
- # if never-before-seen non-fp32 grads are created in some later iteration.
172
- LossScaler.warned_unscaling_non_fp32_grad = True
173
- multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
174
- self._overflow_buf,
175
- [model_grads, stashed_master_grads, master_grads],
176
- out_scale/grads_have_scale, # 1./scale,
177
- out_scale/stashed_have_scale, # 1.0,
178
- 0) # check only arg 0, aka the incoming model grads, for infs
179
- else:
180
- self.unscale_with_stashed_python(model_grads,
181
- stashed_master_grads,
182
- master_grads,
183
- out_scale/grads_have_scale,
184
- out_scale/stashed_have_scale)
185
-
186
- # Defer to update_scale
187
- # If the fused kernel is available, we only need one D2H memcopy and sync.
188
- # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
189
- # self._has_overflow = self._overflow_buf.item()
190
-
191
- def clear_overflow_state(self):
192
- self._has_overflow = False
193
- if self.has_fused_kernel:
194
- self._overflow_buf.zero_()
195
-
196
- # Separate so unscale() can be called more than once before updating.
197
- def update_scale(self):
198
- # If the fused kernel is available, we only need one D2H memcopy and sync.
199
- if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
200
- self._has_overflow = self._overflow_buf.item()
201
-
202
- if self._has_overflow and self.dynamic:
203
- should_skip = True
204
- if(self._min_loss_scale):
205
- self._loss_scale = max(self._min_loss_scale, self._loss_scale/2.)
206
- else:
207
- self._loss_scale = self._loss_scale/2.
208
- self._unskipped = 0
209
- else:
210
- should_skip = False
211
- self._unskipped += 1
212
-
213
- if self._unskipped == self._scale_seq_len and self.dynamic:
214
- self._loss_scale = min(self._max_loss_scale, self._loss_scale*2.)
215
- self._unskipped = 0
216
-
217
- return should_skip
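A condensed, dependency-free sketch of the dynamic loss-scale schedule that `update_scale` above implements: halve the scale on overflow (bounded below by the minimum scale) and double it after `scale_window` consecutive clean steps (bounded above by the maximum). Parameter names and numbers are illustrative.

def update_scale(loss_scale, unskipped, overflow,
                 scale_window=2000, min_scale=1.0, max_scale=2.**24):
    """Return (new_scale, new_unskipped, skip_step) for one iteration."""
    if overflow:
        return max(min_scale, loss_scale / 2.), 0, True
    unskipped += 1
    if unskipped == scale_window:
        return min(max_scale, loss_scale * 2.), 0, False
    return loss_scale, unskipped, False

scale, unskipped = 2.**16, 0
for overflow in [False, True] + [False] * 2000:
    scale, unskipped, skip = update_scale(scale, unskipped, overflow)
print(scale)   # back to 65536.0: halved once, then doubled after 2000 clean steps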
 
apex/apex/amp/utils.py DELETED
@@ -1,210 +0,0 @@
1
- from . import compat
2
-
3
- import functools
4
- import itertools
5
-
6
- import torch
7
-
8
- def is_cuda_enabled():
9
- return torch.version.cuda is not None
10
-
11
- def get_cuda_version():
12
- return tuple(int(x) for x in torch.version.cuda.split('.'))
13
-
14
- def is_fp_tensor(x):
15
- if is_nested(x):
16
- # Fast-fail version of all(is_fp_tensor)
17
- for y in x:
18
- if not is_fp_tensor(y):
19
- return False
20
- return True
21
- return compat.is_tensor_like(x) and compat.is_floating_point(x)
22
-
23
- def is_nested(x):
24
- return isinstance(x, tuple) or isinstance(x, list)
25
-
26
- def should_cache(x):
27
- if is_nested(x):
28
- # Fast-fail version of all(should_cache)
29
- for y in x:
30
- if not should_cache(y):
31
- return False
32
- return True
33
- return isinstance(x, torch.nn.parameter.Parameter) and \
34
- type_string(x) == 'FloatTensor'
35
-
36
- def collect_fp_tensor_types(args, kwargs):
37
- def collect_types(x, types):
38
- if is_nested(x):
39
- for y in x:
40
- collect_types(y, types)
41
- else:
42
- types.add(type_string(x))
43
-
44
- all_args = itertools.chain(args, kwargs.values())
45
- types = set()
46
- for x in all_args:
47
- if is_fp_tensor(x):
48
- collect_types(x, types)
49
- return types
50
-
51
- def type_string(x):
52
- return x.type().split('.')[-1]
53
-
54
- def maybe_half(x, name='', verbose=False):
55
- if is_nested(x):
56
- return type(x)([maybe_half(y) for y in x])
57
-
58
- if not x.is_cuda or type_string(x) == 'HalfTensor':
59
- return x
60
- else:
61
- if verbose:
62
- print('Float->Half ({})'.format(name))
63
- return x.half()
64
-
65
- def maybe_float(x, name='', verbose=False):
66
- if is_nested(x):
67
- return type(x)([maybe_float(y) for y in x])
68
-
69
- if not x.is_cuda or type_string(x) == 'FloatTensor':
70
- return x
71
- else:
72
- if verbose:
73
- print('Half->Float ({})'.format(name))
74
- return x.float()
75
-
76
- # NB: returns casted `args`, mutates `kwargs` in-place
77
- def casted_args(cast_fn, args, kwargs):
78
- new_args = []
79
- for x in args:
80
- if is_fp_tensor(x):
81
- new_args.append(cast_fn(x))
82
- else:
83
- new_args.append(x)
84
- for k in kwargs:
85
- val = kwargs[k]
86
- if is_fp_tensor(val):
87
- kwargs[k] = cast_fn(val)
88
- return new_args
89
-
90
- def cached_cast(cast_fn, x, cache):
91
- if is_nested(x):
92
- return type(x)([cached_cast(cast_fn, y, cache) for y in x])
93
- if x in cache:
94
- cached_x = cache[x]
95
- if x.requires_grad and cached_x.requires_grad:
96
- # Make sure x is actually cached_x's autograd parent.
97
- if cached_x.grad_fn.next_functions[1][0].variable is not x:
98
- raise RuntimeError("x and cache[x] both require grad, but x is not "
99
- "cache[x]'s parent. This is likely an error.")
100
- # During eval, it's possible to end up caching casted weights with
101
- # requires_grad=False. On the next training iter, if cached_x is found
102
- # and reused from the cache, it will not actually have x as its parent.
103
- # Therefore, we choose to invalidate the cache (and force refreshing the cast)
104
- # if x.requires_grad and cached_x.requires_grad do not match.
105
- #
106
- # During eval (i.e. running under with torch.no_grad()) the invalidation
107
- # check would cause the cached value to be dropped every time, because
108
- # cached_x would always be created with requires_grad=False, while x would
109
- # still have requires_grad=True. This would render the cache effectively
110
- # useless during eval. Therefore, if we are running under the no_grad()
111
- # context manager (torch.is_grad_enabled=False) we elide the invalidation
112
- # check, and use the cached value even though its requires_grad flag doesn't
113
- # match. During eval, we don't care that there's no autograd-graph
114
- # connection between x and cached_x.
115
- if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
116
- del cache[x]
117
- else:
118
- return cached_x
119
-
120
- casted_x = cast_fn(x)
121
- cache[x] = casted_x
122
- return casted_x
123
-
124
- def verbosify(cast_fn, fn_name, verbose):
125
- if verbose:
126
- return functools.partial(cast_fn, name=fn_name, verbose=verbose)
127
- else:
128
- return cast_fn
129
-
130
- def as_inplace(fns):
131
- for x in fns:
132
- yield x + '_'
133
-
134
- def has_func(mod, fn):
135
- if isinstance(mod, dict):
136
- return fn in mod
137
- else:
138
- return hasattr(mod, fn)
139
-
140
- def get_func(mod, fn):
141
- if isinstance(mod, dict):
142
- return mod[fn]
143
- else:
144
- return getattr(mod, fn)
145
-
146
- def set_func(mod, fn, new_fn):
147
- if isinstance(mod, dict):
148
- mod[fn] = new_fn
149
- else:
150
- setattr(mod, fn, new_fn)
151
-
152
- def set_func_save(handle, mod, fn, new_fn):
153
- cur_fn = get_func(mod, fn)
154
- handle._save_func(mod, fn, cur_fn)
155
- set_func(mod, fn, new_fn)
156
-
157
- # A couple problems get solved here:
158
- # - The flat_weight buffer is disconnected from autograd graph,
159
- # so the fp16 weights need to be derived from the input weights
160
- # to this forward call, not the flat buffer.
161
- # - The ordering of weights in the flat buffer is...idiosyncratic.
162
- # First problem is solved with combination of set_ (to set up
163
- # correct storage) and copy_ (so the fp16 weight derives from the
164
- # fp32 one in autograd.
165
- # Second is solved by doing ptr arithmetic on the fp32 weights
166
- # to derive the correct offset.
167
- #
168
- # TODO: maybe this should actually use
169
- # `torch._cudnn_rnn_flatten_weight`? But then I need to call
170
- # on first iter and cache the right offsets. Ugh.
171
- def synthesize_flattened_rnn_weights(fp32_weights,
172
- fp16_flat_tensor,
173
- rnn_fn='',
174
- verbose=False):
175
- fp16_weights = []
176
- fp32_base_ptr = fp32_weights[0][0].data_ptr()
177
- for layer_weights in fp32_weights:
178
- fp16_layer_weights = []
179
- for w_fp32 in layer_weights:
180
- w_fp16 = w_fp32.new().half()
181
- offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
182
- w_fp16.set_(fp16_flat_tensor.storage(),
183
- offset,
184
- w_fp32.shape)
185
- w_fp16.copy_(w_fp32)
186
- if verbose:
187
- print('Float->Half ({})'.format(rnn_fn))
188
- fp16_layer_weights.append(w_fp16)
189
- fp16_weights.append(fp16_layer_weights)
190
- return fp16_weights
191
-
192
- # Roughly same as above, just the `fp32_weights` aren't nested.
193
- # Code kept separate for readability.
194
- def new_synthesize_flattened_rnn_weights(fp32_weights,
195
- fp16_flat_tensor,
196
- rnn_fn='',
197
- verbose=False):
198
- fp16_weights = []
199
- fp32_base_ptr = fp32_weights[0].data_ptr()
200
- for w_fp32 in fp32_weights:
201
- w_fp16 = w_fp32.new().half()
202
- offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
203
- w_fp16.set_(fp16_flat_tensor.storage(),
204
- offset,
205
- w_fp32.shape)
206
- w_fp16.copy_(w_fp32)
207
- if verbose:
208
- print('Float->Half ({})'.format(rnn_fn))
209
- fp16_weights.append(w_fp16)
210
- return fp16_weights
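A CPU-only sketch of the pointer-arithmetic idea used by the two synthesize functions above: the element offset of each weight inside the flat buffer is recovered from `data_ptr()` differences. To keep the sketch simple it writes the fp16 values with slicing rather than the `set_`/`copy_` view construction used above.

import torch

# Two "layer weights" that are views into one flat fp32 buffer.
flat_fp32 = torch.arange(12, dtype=torch.float32)
w0 = flat_fp32[0:6].view(2, 3)
w1 = flat_fp32[6:12].view(3, 2)

base_ptr = flat_fp32.data_ptr()
flat_fp16 = torch.empty(12, dtype=torch.float16)

for w in (w0, w1):
    # data_ptr() differences are in bytes; divide by element_size() to get
    # the element offset of this weight inside the flat buffer.
    offset = (w.data_ptr() - base_ptr) // w.element_size()
    flat_fp16[offset:offset + w.numel()] = w.reshape(-1).half()

print(flat_fp16)   # fp16 copy of 0..11, laid out at the same offsets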
 
apex/apex/amp/wrap.py DELETED
@@ -1,276 +0,0 @@
1
- from . import compat
2
- from . import utils
3
- from ._amp_state import _amp_state
4
- from . import rnn_compat
5
-
6
- import functools
7
-
8
- import torch
9
-
10
- def make_cast_wrapper(orig_fn, cast_fn, handle,
11
- try_caching=False):
12
- @functools.wraps(orig_fn)
13
- def wrapper(*args, **kwargs):
14
- if not handle.is_active():
15
- return orig_fn(*args, **kwargs)
16
-
17
- if try_caching and handle.has_cache:
18
- args = list(args)
19
- for i in range(len(args)):
20
- if utils.should_cache(args[i]):
21
- args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
22
- for k in kwargs:
23
- if utils.should_cache(kwargs[k]):
24
- kwargs[k] = utils.cached_cast(cast_fn, kwargs[k], handle.cache)
25
- new_args = utils.casted_args(cast_fn,
26
- args,
27
- kwargs)
28
- return orig_fn(*new_args, **kwargs)
29
- return wrapper
30
-
31
- def cached_cast(mod, fn, cast_fn, handle,
32
- try_caching=False, verbose=False):
33
- if not utils.has_func(mod, fn):
34
- return
35
-
36
- orig_fn = utils.get_func(mod, fn)
37
- cast_fn = utils.verbosify(cast_fn, fn, verbose)
38
- wrapper = make_cast_wrapper(orig_fn, cast_fn, handle, try_caching)
39
- utils.set_func_save(handle, mod, fn, wrapper)
40
-
41
- # `handle` arg is unused, but simplifies the API to match `make_cast_wrapper`.
42
- # Annoyingly, make_promote_wrapper still uses the global handle. Once everyone
43
- # is on the new API and I am free to get rid of handle, I can clean this up.
44
- def make_promote_wrapper(orig_fn, cast_fn, handle=None):
45
- @functools.wraps(orig_fn)
46
- def wrapper(*args, **kwargs):
47
- if not _amp_state.handle.is_active():
48
- return orig_fn(*args, **kwargs)
49
-
50
- types = utils.collect_fp_tensor_types(args, kwargs)
51
-
52
- if len(types) <= 1:
53
- return orig_fn(*args, **kwargs)
54
- elif len(types) == 2 and types == set(['HalfTensor', 'FloatTensor']):
55
- new_args = utils.casted_args(cast_fn,
56
- args,
57
- kwargs)
58
- return orig_fn(*new_args, **kwargs)
59
- else:
60
- raise NotImplementedError('Do not know how to handle ' +
61
- 'these types to promote: {}'
62
- .format(types))
63
- return wrapper
64
-
65
- def promote(mod, fn, handle, verbose=False):
66
- orig_fn = utils.get_func(mod, fn)
67
- maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
68
- wrapper = make_promote_wrapper(orig_fn, maybe_float)
69
- utils.set_func_save(handle, mod, fn, wrapper)
70
-
71
- def sequence_promote(mod, fn, handle, verbose=False):
72
- orig_fn = utils.get_func(mod, fn)
73
- maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
74
- @functools.wraps(orig_fn)
75
- def wrapper(seq, *args, **kwargs):
76
- if not _amp_state.handle.is_active():
77
- return orig_fn(seq, *args, **kwargs)
78
-
79
- types = set([utils.type_string(x) for x in seq])
80
- if len(types) <= 1:
81
- return orig_fn(seq, *args, **kwargs)
82
- elif types == set(['HalfTensor', 'FloatTensor']):
83
- cast_seq = utils.casted_args(maybe_float,
84
- seq, {})
85
- return orig_fn(cast_seq, *args, **kwargs)
86
- else:
87
- # TODO: other mixed-type cases aren't due to amp.
88
- # Just pass through?
89
- return orig_fn(seq, *args, **kwargs)
90
- utils.set_func_save(handle, mod, fn, wrapper)
91
-
92
- def promote_match_arg0(mod, fn, handle, verbose=False):
93
- if not utils.has_func(mod, fn):
94
- return
95
-
96
- orig_fn = utils.get_func(mod, fn)
97
- @functools.wraps(orig_fn)
98
- def wrapper(arg0, *args, **kwargs):
99
- assert compat.is_tensor_like(arg0)
100
- if not _amp_state.handle.is_active():
101
- return orig_fn(arg0, *args, **kwargs)
102
-
103
- if utils.type_string(arg0) == 'HalfTensor':
104
- cast_fn = utils.maybe_half
105
- elif utils.type_string(arg0) == 'FloatTensor':
106
- cast_fn = utils.maybe_float
107
- else:
108
- return orig_fn(arg0, *args, **kwargs)
109
- cast_fn = utils.verbosify(cast_fn, fn, verbose)
110
- new_args = utils.casted_args(cast_fn, args, kwargs)
111
- return orig_fn(arg0, *new_args, **kwargs)
112
- utils.set_func_save(handle, mod, fn, wrapper)
113
-
114
- def err_if_any_half(mod, fn, handle, custom_err_msg=None):
115
- if not utils.has_func(mod, fn):
116
- return
117
-
118
- orig_fn = utils.get_func(mod, fn)
119
- @functools.wraps(orig_fn)
120
- def wrapper(*args, **kwargs):
121
- types = utils.collect_fp_tensor_types(args, kwargs)
122
- if 'HalfTensor' in types:
123
- if custom_err_msg:
124
- raise NotImplementedError(custom_err_msg)
125
- else:
126
- raise NotImplementedError('Cannot call in-place function ' +
127
- '{} with fp16 arguments.'.format(fn))
128
- else:
129
- return orig_fn(*args, **kwargs)
130
- utils.set_func_save(handle, mod, fn, wrapper)
131
-
132
- def err_if_arg0_half(mod, fn, handle, verbose=False):
133
- if not utils.has_func(mod, fn):
134
- return
135
-
136
- orig_fn = utils.get_func(mod, fn)
137
- @functools.wraps(orig_fn)
138
- def wrapper(arg0, *args, **kwargs):
139
- assert compat.is_tensor_like(arg0)
140
- if utils.type_string(arg0) == 'HalfTensor':
141
- raise NotImplementedError('Cannot call in-place method ' +
142
- '{} on fp16 Tensors.'.format(fn))
143
- else:
144
- cast_fn = utils.verbosify(utils.maybe_float, fn, verbose)
145
- new_args = utils.casted_args(cast_fn, args, kwargs)
146
- return orig_fn(arg0, *new_args, **kwargs)
147
- utils.set_func_save(handle, mod, fn, wrapper)
148
-
149
- # Current RNN approach:
150
- # - Wrap top-level `RNN` function in thnn backend
151
- # - Will call into either CudnnRNN or AutogradRNN
152
- # - Each of these are factory functions that return a per-iter
153
- # `forward` function
154
- # - We interpose on the factory function to:
155
- # 1) Interpose on the actual forward function and put in casts
156
- # 2) Insert an fp16 `flat_weight` if necessary
157
- def rnn_cast(backend, fn, handle, verbose=False):
158
- orig_rnn = utils.get_func(backend, fn)
159
- @functools.wraps(orig_rnn)
160
- def rnn_wrapper(*args, **kwargs):
161
- flat_weight = kwargs.get('flat_weight')
162
- if flat_weight is not None:
163
- # We replace `flat_weight` with an uninitialized fp16
164
- # Tensor. The "actual" weight tensors (provided in `forward`),
165
- # will then be set up as ptrs into the buffer and have the
166
- # corresponding fp32 values copied in.
167
- # We need to call `copy` on the "actual" weights so that the
168
- # autograd graph correctly backprops from the wgrads computed
169
- # inside cuDNN (on fp16 weights) into the fp32 weights.
170
- assert utils.type_string(flat_weight) == 'FloatTensor'
171
- if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
172
- # Pre-0.4. A little slower, since it zeros out memory.
173
- flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
174
- else:
175
- flat_weight_fp16 = torch.empty_like(flat_weight,
176
- dtype=torch.float16)
177
- kwargs['flat_weight'] = flat_weight_fp16
178
- else:
179
- flat_weight_fp16 = None
180
-
181
- forward = orig_rnn(*args, **kwargs)
182
- @functools.wraps(forward)
183
- def fwd_wrapper(*fargs, **fkwargs):
184
- assert len(fargs) == 3 or len(fargs) == 4
185
- inputs, weights, hiddens = fargs[:3]
186
- assert utils.is_fp_tensor(inputs)
187
- assert isinstance(weights, list)
188
- cast_fn = utils.verbosify(utils.maybe_half,
189
- fn,
190
- verbose)
191
- new_args = []
192
-
193
- # 0) Inputs
194
- new_args.append(cast_fn(inputs))
195
-
196
- # 1) Weights
197
- if flat_weight_fp16 is not None:
198
- fp16_weights = utils.synthesize_flattened_rnn_weights(
199
- weights, flat_weight_fp16, fn, verbose)
200
- else:
201
- fp16_weights = [[cast_fn(w) for w in layer]
202
- for layer in weights]
203
- new_args.append(fp16_weights)
204
-
205
- # 2) Hiddens: either a tuple (for LSTM) or a single tensor
206
- if isinstance(hiddens, tuple):
207
- new_args.append(tuple(cast_fn(x) for x in hiddens))
208
- elif utils.is_fp_tensor(hiddens):
209
- new_args.append(cast_fn(hiddens))
210
- else:
211
- # Hiddens can, in principle, be `None` -- pass through
212
- new_args.append(hiddens)
213
-
214
- # 3) Batch sizes (0.4 or later only)
215
- if len(fargs) == 4:
216
- new_args.append(fargs[3])
217
-
218
- return forward(*new_args, **fkwargs)
219
- return fwd_wrapper
220
- utils.set_func_save(handle, backend, fn, rnn_wrapper)
221
-
222
- def new_rnn_cast(fn, handle, verbose=False):
223
- # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
224
- # For rnn backend calls that route through _rnn_impls, we must patch the ref
225
- # that _rnn_impls stashed. For rnn backend calls that directly invoke
226
- # _VF.<backend>, e.g. _VF.lstm, we can patch onto VariableFunctionsShim,
227
- # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
228
- if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn):
229
- mod = torch.nn.modules.rnn._rnn_impls
230
- else:
231
- mod = torch.nn.modules.rnn._VF
232
- assert isinstance(mod, rnn_compat.VariableFunctionsShim)
233
- fn = fn.lower()
234
- orig_fn = utils.get_func(mod, fn)
235
- cast_fn = utils.verbosify(utils.maybe_half, fn, verbose)
236
- @functools.wraps(orig_fn)
237
- def wrapper(*args, **kwargs):
238
- # Exact call signature from modules/rnn.py
239
- assert len(args) == 9
240
- assert len(kwargs) == 0
241
-
242
- if not _amp_state.handle.is_active():
243
- return orig_fn(*args, **kwargs)
244
-
245
- if isinstance(args[6], bool):
246
- params_idx = 2 # Not PackedSequence case
247
- else:
248
- params_idx = 3 # PackedSequence case
249
-
250
- new_args = []
251
- for i, arg in enumerate(args):
252
- if i == params_idx:
253
- num_params = sum([x.numel() for x in arg])
254
- fp16_weight_buf = args[0].new_empty((num_params,),
255
- dtype=torch.half)
256
- casted_weights = utils.new_synthesize_flattened_rnn_weights(
257
- arg, fp16_weight_buf, fn, verbose)
258
- new_args.append(casted_weights)
259
- elif utils.is_fp_tensor(arg):
260
- new_args.append(cast_fn(arg))
261
- else:
262
- new_args.append(arg)
263
-
264
- return orig_fn(*new_args)
265
- utils.set_func_save(handle, mod, fn, wrapper)
266
-
267
- def disable_casts(mod, fn, handle):
268
- if not utils.has_func(mod, fn):
269
- return
270
-
271
- orig_fn = utils.get_func(mod, fn)
272
- @functools.wraps(orig_fn)
273
- def wrapper(*args, **kwargs):
274
- with handle._disable_casts():
275
- return orig_fn(*args, **kwargs)
276
- utils.set_func_save(handle, mod, fn, wrapper)
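A self-contained sketch of the patch-and-restore pattern that the wrappers above rely on through `utils.set_func_save` (stash the original so `_deactivate` in handle.py can reinstate it later). The logging wrapper and the choice of `math.sqrt` are placeholders.

import functools
import math

_saved = []

def patch(mod, name, make_wrapper):
    orig = getattr(mod, name)
    _saved.append((mod, name, orig))        # stash the original for restoration
    setattr(mod, name, make_wrapper(orig))

def restore_all():
    for mod, name, orig in _saved:
        setattr(mod, name, orig)
    _saved.clear()

def make_logging_wrapper(orig_fn):
    @functools.wraps(orig_fn)
    def wrapper(*args, **kwargs):
        print("calling", orig_fn.__name__, "with", args)
        return orig_fn(*args, **kwargs)
    return wrapper

patch(math, "sqrt", make_logging_wrapper)
math.sqrt(4.0)       # prints "calling sqrt with (4.0,)"
restore_all()
math.sqrt(4.0)       # unpatched again, no print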
 
apex/apex/contrib/__init__.py DELETED
File without changes
apex/apex/contrib/bottleneck/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .bottleneck import Bottleneck, SpatialBottleneck
2
- from .halo_exchangers import HaloExchangerNoComm, HaloExchangerAllGather, HaloExchangerSendRecv, HaloExchangerPeer
 
apex/apex/contrib/bottleneck/bottleneck.py DELETED
@@ -1,749 +0,0 @@
1
- import functools as func
2
-
3
- import torch
4
- import torch.distributed as dist
5
- from torch import nn
6
-
7
- from apex import check_cudnn_version_and_warn
8
- import fast_bottleneck
9
- import nccl_p2p_cuda as inc
10
-
11
-
12
- assert check_cudnn_version_and_warn(__name__, 8400)
13
-
14
-
15
- def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
16
- weight_tensor_nchw = tensor
17
- nn.init.kaiming_uniform_(weight_tensor_nchw, a=a, mode=mode, nonlinearity=nonlinearity)
18
-
19
- def compute_scale_bias_one(nhwc, weight, bias, running_mean, running_var, w_scale, w_bias):
20
- scale = weight * running_var.rsqrt()
21
- bias = bias - running_mean * scale
22
- w_scale.copy_(scale)
23
- w_bias.copy_(bias)
24
-
25
- def compute_scale_bias_method(nhwc, args):
26
- for arg in args:
27
- # arg is tuple of (weight, bias, running_mean, running_var, w_scale, w_bias)
28
- compute_scale_bias_one(nhwc, *arg)
29
-
30
- class FrozenBatchNorm2d(torch.jit.ScriptModule):
31
- """
32
- BatchNorm2d where the batch statistics and the affine parameters are fixed
33
- """
34
- def __init__(self, n):
35
- super(FrozenBatchNorm2d, self).__init__()
36
- self.register_buffer("weight", torch.ones(n))
37
- self.register_buffer("bias", torch.zeros(n))
38
- self.register_buffer("running_mean", torch.zeros(n))
39
- self.register_buffer("running_var", torch.ones(n))
40
-
41
- @torch.jit.script_method
42
- def get_scale_bias(self, nhwc):
43
- # type: (bool) -> List[torch.Tensor]
44
- scale = self.weight * self.running_var.rsqrt()
45
- bias = self.bias - self.running_mean * scale
46
- if nhwc:
47
- scale = scale.reshape(1, 1, 1, -1)
48
- bias = bias.reshape(1, 1, 1, -1)
49
- else:
50
- scale = scale.reshape(1, -1, 1, 1)
51
- bias = bias.reshape(1, -1, 1, 1)
52
- return scale, bias
53
-
54
- @torch.jit.script_method
55
- def forward(self, x):
56
- scale, bias = self.get_scale_bias(False)
57
- return x * scale + bias
58
-
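A quick numerical check (illustration only, not part of the file) that the folding in `get_scale_bias` above reproduces frozen batch norm: with scale = weight / sqrt(var) and bias' = bias - mean * scale, x * scale + bias' equals weight * (x - mean) / sqrt(var) + bias (eps is ignored, as in the class above).

import torch

n = 8
x = torch.randn(4, n, 5, 5)
weight, bias = torch.randn(n), torch.randn(n)
running_mean, running_var = torch.randn(n), torch.rand(n) + 0.5

scale = weight * running_var.rsqrt()
folded_bias = bias - running_mean * scale
out_folded = x * scale.reshape(1, -1, 1, 1) + folded_bias.reshape(1, -1, 1, 1)

bn = lambda t: t.reshape(1, -1, 1, 1)          # broadcast helper for NCHW
out_ref = (x - bn(running_mean)) * bn(running_var.rsqrt()) * bn(weight) + bn(bias)

print(torch.allclose(out_folded, out_ref, atol=1e-5))   # True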
59
- @torch.jit.script
60
- def drelu_dscale1(grad_o, output, scale1):
61
- relu_mask = (output>0)
62
- dx_relu = relu_mask * grad_o
63
- g1 = dx_relu * scale1
64
- return g1, dx_relu
65
-
66
- @torch.jit.script
67
- def drelu_dscale2(grad_o, output, scale1, scale2):
68
- relu_mask = (output>0)
69
- dx_relu = relu_mask * grad_o
70
- g1 = dx_relu * scale1
71
- g2 = dx_relu * scale2
72
- return g1, g2
73
-
74
- class BottleneckFunction(torch.autograd.Function):
75
- @staticmethod
76
- def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
77
- # TODO: clean up order of tensors
78
- args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
79
- ctx.downsample = len(conv) > 3
80
- if ctx.downsample:
81
- args.append(conv[3])
82
- args.append(scale[3])
83
- args.append(bias[3])
84
-
85
- # weight buffers are always in nhwc while shape can be nhwc or channels_last
86
- # here we pass in flag and let c++ handle it
87
- # alternatively, we can put all sizes into a fixed format and pass it in
88
- outputs = fast_bottleneck.forward(nhwc, stride_1x1, args)
89
- ctx.save_for_backward(*(args+outputs))
90
- # save relu outputs for drelu
91
- ctx.nhwc = nhwc
92
- ctx.stride_1x1 = stride_1x1
93
- return outputs[2]
94
-
95
- # backward relu is not exposed, MUL with mask used now
96
- # only support dgrad
97
- @staticmethod
98
- def backward(ctx, grad_o):
99
- outputs = ctx.saved_tensors[-3:]
100
-
101
- if ctx.downsample:
102
- grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
103
- else:
104
- grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
105
-
106
- # create input vector for backward
107
- t_list = [*ctx.saved_tensors[0:10]]
108
- t_list.append(grad_conv3)
109
- t_list.append(grad_conv4)
110
-
111
- # outputs used for wgrad and generating drelu mask
112
- t_list.append(outputs[0])
113
- t_list.append(outputs[1])
114
-
115
- # in case there is downsample
116
- if ctx.downsample:
117
- t_list.append(ctx.saved_tensors[10])
118
-
119
- grads = fast_bottleneck.backward(ctx.nhwc, ctx.stride_1x1, t_list)
120
-
121
- return (None, None, None, None, *grads)
122
-
123
- bottleneck_function = BottleneckFunction.apply
124
-
125
- def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
126
- """3x3 convolution with padding"""
127
- return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
128
- padding=dilation, groups=groups, bias=False, dilation=dilation)
129
-
130
- def conv1x1(in_planes, out_planes, stride=1):
131
- """1x1 convolution"""
132
- return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
133
-
134
- class Bottleneck(torch.nn.Module):
135
- # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
136
- # while original implementation places the stride at the first 1x1 convolution(self.conv1)
137
- # according to "Deep residual learning for image recognition", https://arxiv.org/abs/1512.03385.
138
- # This variant is also known as ResNet V1.5 and improves accuracy according to
139
- # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
140
- # here we put it at 1x1
141
-
142
- def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
143
- dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False):
144
- super(Bottleneck, self).__init__()
145
- if groups != 1:
146
- raise RuntimeError('Only support groups == 1')
147
- if dilation != 1:
148
- raise RuntimeError('Only support dilation == 1')
149
- if norm_func is None:
150
- norm_func = FrozenBatchNorm2d
151
- else:
152
- raise RuntimeError('Only support frozen BN now.')
153
-
154
- if stride != 1 or in_channels != out_channels:
155
- self.downsample = nn.Sequential(
156
- conv1x1(in_channels, out_channels, stride),
157
- norm_func(out_channels),
158
- )
159
- else:
160
- self.downsample = None
161
-
162
- # Both self.conv2 and self.downsample layers downsample the input when stride != 1
163
- self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
164
- self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
165
- self.conv3 = conv1x1(bottleneck_channels, out_channels)
166
- self.relu = nn.ReLU(inplace=True)
167
- self.stride = stride
168
-
169
- self.bn1 = norm_func(bottleneck_channels)
170
- self.bn2 = norm_func(bottleneck_channels)
171
- self.bn3 = norm_func(out_channels)
172
- self.w_scale = None
173
-
174
- self.use_cudnn = use_cudnn
175
-
176
- # setup conv weights
177
- self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
178
- if self.downsample is not None:
179
- self.w_conv.append(self.downsample[0].weight)
180
-
181
- # init weight in nchw format before possible transpose
182
- for w in self.w_conv:
183
- kaiming_uniform_(w, a=1)
184
-
185
- # TODO: prevent unsupported case usage
186
- # support cases
187
- # native cudnn
188
- # normal yes no
189
- # channel_last yes yes
190
- # explicit_nhwc no yes
191
- self.explicit_nhwc = explicit_nhwc
192
- if self.explicit_nhwc:
193
- for p in self.parameters():
194
- with torch.no_grad():
195
- p.data = p.data.permute(0,2,3,1).contiguous()
196
-
197
- return
198
-
199
- # Returns single callable that recomputes scale and bias for all frozen batch-norms.
200
- # This method must be called before cuda graphing.
201
- # The callable it returns can be called anytime.
202
- # Calling this method will prevent these from being computed every forward call.
203
- def get_scale_bias_callable(self):
204
- self.w_scale, self.w_bias, args = [], [], []
205
- batch_norms = [self.bn1, self.bn2, self.bn3]
206
- if self.downsample is not None:
207
- batch_norms.append(self.downsample[1])
208
- for bn in batch_norms:
209
- s = torch.empty_like(bn.weight)
210
- b = torch.empty_like(s)
211
- args.append( (bn.weight, bn.bias, bn.running_mean, bn.running_var, s, b) )
212
- if self.explicit_nhwc:
213
- self.w_scale.append( s.reshape(1, 1, 1, -1) )
214
- self.w_bias.append( b.reshape(1, 1, 1, -1) )
215
- else:
216
- self.w_scale.append( s.reshape(1, -1, 1, 1) )
217
- self.w_bias.append( b.reshape(1, -1, 1, 1) )
218
- return func.partial(compute_scale_bias_method, self.explicit_nhwc, args)
219
-
220
- def forward(self, x):
221
- if self.use_cudnn:
222
- if self.w_scale is None:
223
- # calculate scale/bias from registered buffers
224
- # TODO: make this better
225
- s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
226
- s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
227
- s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
228
- w_scale = [s1, s2, s3]
229
- w_bias = [b1, b2, b3]
230
- if self.downsample is not None:
231
- s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
232
- w_scale.append(s4)
233
- w_bias.append(b4)
234
- out = bottleneck_function(self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv)
235
- else:
236
- out = bottleneck_function(self.explicit_nhwc, self.stride, self.w_scale, self.w_bias, x, *self.w_conv)
237
- return out
238
-
239
- if self.explicit_nhwc:
240
- raise RuntimeError('explicit nhwc with native ops is not supported.')
241
-
242
- # fallback to native ops
243
- identity = x
244
-
245
- out = self.conv1(x)
246
- out = self.bn1(out)
247
- out = self.relu(out)
248
-
249
- out = self.conv2(out)
250
- out = self.bn2(out)
251
- out = self.relu(out)
252
-
253
- out = self.conv3(out)
254
- out = self.bn3(out)
255
-
256
- if self.downsample is not None:
257
- identity = self.downsample(x)
258
-
259
- out += identity
260
- out = self.relu(out)
261
-
262
- return out
263
-
264
-
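A hedged usage sketch for the Bottleneck module above, exercising the native-op fallback path (the default `use_cudnn=False`); it assumes the contrib extensions imported at the top of this file (fast_bottleneck, nccl_p2p_cuda) are built and a CUDA device is available. Channel counts and input shape are placeholders.

import torch
from apex.contrib.bottleneck import Bottleneck

# Defaults: stride=1, frozen BN, native ops (use_cudnn=False).
block = Bottleneck(in_channels=64, bottleneck_channels=16, out_channels=64).cuda()

x = torch.randn(2, 64, 32, 32, device="cuda")
out = block(x)
print(out.shape)   # torch.Size([2, 64, 32, 32])

# For the fused path, construct with use_cudnn=True (optionally explicit_nhwc=True);
# get_scale_bias_callable() can precompute the folded scale/bias once, e.g. before
# CUDA graph capture, as the comments above describe.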
265
- class SpatialBottleneckFunction(torch.autograd.Function):
266
- @staticmethod
267
- def forward(ctx, spatial_group_size, spatial_group_rank, spatial_communicator, spatial_halo_exchanger, spatial_method, use_delay_kernel, explicit_nhwc, stride_1x1, scale, bias, thresholdTop, thresholdBottom, x, *conv):
268
- if spatial_group_size > 1:
269
- stream1 = spatial_halo_exchanger.stream1
270
- stream2 = spatial_halo_exchanger.stream2
271
- stream3 = spatial_halo_exchanger.stream3
272
-
273
- # TODO: clean up order of tensors
274
- args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
275
- ctx.downsample = len(conv) > 3
276
- if ctx.downsample:
277
- args.append(conv[3])
278
- args.append(scale[3])
279
- args.append(bias[3])
280
-
281
- # weight buffers are always in explicit_nhwc while shape can be explicit_nhwc or channels_last
282
- # here we pass in flag and let c++ handle it
283
- # alternatively, we can put all sizes into a fixed format and pass it in
284
- outputs = fast_bottleneck.forward_init(explicit_nhwc, stride_1x1, args)
285
- fast_bottleneck.forward_out1(explicit_nhwc, stride_1x1, args, outputs)
286
-
287
- if spatial_group_size > 1:
288
- out1 = outputs[0]
289
- if explicit_nhwc:
290
- N,Hs,W,C = list(out1.shape)
291
- memory_format = torch.contiguous_format
292
- out1_pad = torch.empty([N,Hs+2,W,C], dtype=out1.dtype, device='cuda')
293
- else:
294
- N,C,Hs,W = list(out1.shape)
295
- memory_format = torch.channels_last if out1.is_contiguous(memory_format=torch.channels_last) else torch.contiguous_format
296
- out1_pad = torch.empty([N,C,Hs+2,W], dtype=out1.dtype, device='cuda', memory_format=memory_format)
297
- stream1.wait_stream(torch.cuda.current_stream())
298
- if spatial_method != 2: stream3.wait_stream(torch.cuda.current_stream())
299
- with torch.cuda.stream(stream1):
300
- if explicit_nhwc:
301
- top_out1_halo = out1_pad[:,:1,:,:]
302
- btm_out1_halo = out1_pad[:,Hs+1:Hs+2,:,:]
303
- spatial_halo_exchanger.left_right_halo_exchange(out1[:,:1,:,:], out1[:,Hs-1:,:,:], top_out1_halo, btm_out1_halo)
304
- else:
305
- top_out1_halo = out1_pad[:,:,:1,:]
306
- btm_out1_halo = out1_pad[:,:,Hs+1:Hs+2,:]
307
- spatial_halo_exchanger.left_right_halo_exchange(out1[:,:,:1,:], out1[:,:,Hs-1:,:], top_out1_halo, btm_out1_halo)
308
- if spatial_method == 1:
309
- # overlap mid convolution with halo transfer
310
- if spatial_group_rank < spatial_group_size-1:
311
- stream2.wait_stream(stream1)
312
- with torch.cuda.stream(stream2):
313
- if explicit_nhwc:
314
- btm_fat_halo = torch.empty((N,3,W,C),dtype=out1.dtype,device=out1.device)
315
- btm_fat_halo[:,0:2,:,:].copy_(out1[:,Hs-2:,:,:])
316
- btm_fat_halo[:,2:,:,:].copy_(btm_out1_halo)
317
- else:
318
- btm_fat_halo = torch.empty((N,C,3,W),dtype=out1.dtype,device=out1.device)
319
- btm_fat_halo[:,:,0:2,:].copy_(out1[:,:,Hs-2:,:])
320
- btm_fat_halo[:,:,2:,:].copy_(btm_out1_halo)
321
- btm_out2 = fast_bottleneck.forward_out2_halo(explicit_nhwc, btm_fat_halo, args)
322
- if spatial_group_rank > 0:
323
- with torch.cuda.stream(stream1):
324
- if explicit_nhwc:
325
- top_fat_halo = torch.empty((N,3,W,C),dtype=out1.dtype,device=out1.device)
326
- top_fat_halo[:,:1,:,:].copy_(top_out1_halo)
327
- top_fat_halo[:,1:3,:,:].copy_(out1[:,:2,:,:])
328
- else:
329
- top_fat_halo = torch.empty((N,C,3,W),dtype=out1.dtype,device=out1.device)
330
- top_fat_halo[:,:,:1,:].copy_(top_out1_halo)
331
- top_fat_halo[:,:,1:3,:].copy_(out1[:,:,:2,:])
332
- top_out2 = fast_bottleneck.forward_out2_halo(explicit_nhwc, top_fat_halo, args)
333
- if use_delay_kernel: inc.add_delay(10)
334
- elif spatial_method != 2 and spatial_method != 3:
335
- assert(False), "spatial_method must be 1, 2 or 3"
336
-
337
- if spatial_group_size <= 1:
338
- fast_bottleneck.forward_out2(explicit_nhwc, stride_1x1, args, outputs)
339
- elif spatial_method == 1:
340
- fast_bottleneck.forward_out2(explicit_nhwc, stride_1x1, args, outputs)
341
- with torch.cuda.stream(stream3):
342
- if explicit_nhwc:
343
- out1_pad[:,1:Hs+1,:,:].copy_(out1)
344
- else:
345
- out1_pad[:,:,1:Hs+1,:].copy_(out1)
346
- elif spatial_method == 2:
347
- # wait for halo transfer to finish before doing a full convolution of padded x
348
- if explicit_nhwc:
349
- out1_pad[:,1:Hs+1,:,:].copy_(out1)
350
- else:
351
- out1_pad[:,:,1:Hs+1,:].copy_(out1)
352
- torch.cuda.current_stream().wait_stream(stream1)
353
- fast_bottleneck.forward_out2_pad(explicit_nhwc, stride_1x1, args, outputs, out1_pad)
354
- elif spatial_method == 3:
355
- fast_bottleneck.forward_out2_mask(explicit_nhwc, stride_1x1, args, outputs, thresholdTop, thresholdBottom)
356
- with torch.cuda.stream(stream3):
357
- if explicit_nhwc:
358
- out1_pad[:,1:Hs+1,:,:].copy_(out1)
359
- else:
360
- out1_pad[:,:,1:Hs+1,:].copy_(out1)
361
-
362
- # compute halo cells for outputs[1] (out2)
363
- if spatial_group_size > 1:
364
- out2 = outputs[1]
365
- if explicit_nhwc:
366
- top_out2_halo = out2[:,:1,:,:]
367
- btm_out2_halo = out2[:,Hs-1:,:,:]
368
- else:
369
- top_out2_halo = out2[:,:,:1,:]
370
- btm_out2_halo = out2[:,:,Hs-1:,:]
371
- if spatial_method == 1:
372
- if spatial_group_rank > 0:
373
- torch.cuda.current_stream().wait_stream(stream1)
374
- top_out2_halo.copy_(top_out2)
375
- if spatial_group_rank < spatial_group_size-1:
376
- torch.cuda.current_stream().wait_stream(stream2)
377
- btm_out2_halo.copy_(btm_out2)
378
- elif spatial_method == 3:
379
- # Note
380
- # out2 halo correction cannot overlap with anything since it has
381
- # to wait for out2_mask to finish, but itself has to finish before
382
- # the first kernel of _forward_rest can launch.
383
- # At least we can overlap the two halo correction kernels.
384
- if spatial_group_rank < spatial_group_size-1:
385
- stream2.wait_stream(stream1) # wait for halo transfers to finish
386
- stream2.wait_stream(torch.cuda.current_stream()) # wait for *_out2_mask to finish
387
- with torch.cuda.stream(stream2):
388
- w1by3 = args[2][:,2:3,:,:].clone()
389
- btm_out1_halo = btm_out1_halo.clone()
390
- btm_out2 = fast_bottleneck.forward_out2_halo_corr(explicit_nhwc, btm_out1_halo, args, w1by3, btm_out2_halo.clone())
391
- btm_out2_halo.copy_(btm_out2)
392
- if spatial_group_rank > 0:
393
- stream1.wait_stream(torch.cuda.current_stream()) # wait for *_out2_mask to finish
394
- with torch.cuda.stream(stream1):
395
- w1by3 = args[2][:,:1,:,:].clone()
396
- top_out1_halo = top_out1_halo.clone()
397
- top_out2 = fast_bottleneck.forward_out2_halo_corr(explicit_nhwc, top_out1_halo, args, w1by3, top_out2_halo.clone())
398
- top_out2_halo.copy_(top_out2)
399
- if spatial_group_rank < spatial_group_size-1:
400
- torch.cuda.current_stream().wait_stream(stream2)
401
- if spatial_group_rank > 0:
402
- torch.cuda.current_stream().wait_stream(stream1)
403
-
404
- fast_bottleneck.forward_rest(explicit_nhwc, stride_1x1, args, outputs)
405
- # save halos for backward pass
406
- if spatial_group_size > 1:
407
- if spatial_method != 2:
408
- # make sure copy of mid-section of out1 into out1_pad is done before exiting
409
- torch.cuda.current_stream().wait_stream(stream3)
410
- ctx.save_for_backward(*(args+outputs+[out1_pad,]))
411
- else:
412
- ctx.save_for_backward(*(args+outputs))
413
- # save relu outputs for drelu
414
- ctx.explicit_nhwc = explicit_nhwc
415
- ctx.stride_1x1 = stride_1x1
416
- ctx.spatial_group_size = spatial_group_size
417
- if spatial_group_size > 1:
418
- ctx.spatial_group_rank = spatial_group_rank
419
- ctx.spatial_halo_exchanger = spatial_halo_exchanger
420
- ctx.spatial_method = spatial_method
421
- ctx.use_delay_kernel = use_delay_kernel
422
- ctx.thresholdTop = thresholdTop
423
- ctx.thresholdBottom = thresholdBottom
424
- ctx.stream1 = stream1
425
- ctx.stream2 = stream2
426
- ctx.stream3 = stream3
427
- return outputs[2]
428
-
429
- # backward relu is not exposed; a multiply with the relu mask is used instead
430
- # only dgrad is supported
431
- @staticmethod
432
- def backward(ctx, grad_o):
433
- if ctx.spatial_group_size > 1:
434
- out1_pad = ctx.saved_tensors[-1]
435
- outputs = ctx.saved_tensors[-4:-1]
436
- else:
437
- outputs = ctx.saved_tensors[-3:]
438
-
439
- if ctx.downsample:
440
- grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
441
- else:
442
- grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
443
-
444
- # create input vector for backward
445
- t_list = [*ctx.saved_tensors[0:10]]
446
- t_list.append(grad_conv3)
447
- t_list.append(grad_conv4)
448
-
449
- # outputs used for wgrad and generating drelu mask
450
- t_list.append(outputs[0])
451
- t_list.append(outputs[1])
452
-
453
- # in case there is downsample
454
- if ctx.downsample:
455
- t_list.append(ctx.saved_tensors[10])
456
-
457
- grads = fast_bottleneck.backward_init(ctx.explicit_nhwc, ctx.stride_1x1, t_list)
458
- wgrad3_stream = torch.cuda.Stream()
459
- wgrad3_stream.wait_stream(torch.cuda.current_stream())
460
- grad_out2 = fast_bottleneck.backward_grad_out2(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads)
461
- wgrad2_stream = torch.cuda.Stream()
462
- wgrad2_stream.wait_stream(torch.cuda.current_stream())
463
- # do halo exchange of grad_out2 here
464
- # compute halo cells for grad_out1
465
- if ctx.spatial_group_size > 1:
466
- if ctx.explicit_nhwc:
467
- N,Hs,W,C = list(grad_out2.shape)
468
- else:
469
- N,C,Hs,W = list(grad_out2.shape)
470
- relu1 = t_list[12]
471
- ctx.stream1.wait_stream(torch.cuda.current_stream())
472
- with torch.cuda.stream(ctx.stream1):
473
- top_halo, btm_halo = ctx.spatial_halo_exchanger.left_right_halo_exchange(grad_out2[:,:1,:,:], grad_out2[:,Hs-1:,:,:])
474
- # copy halos to send buffer
475
- if ctx.spatial_method == 1 or ctx.spatial_method == 2:
476
- # 1 -> halo recompute approach
477
- # 2 -> wait for concatenated halos, then do single conv on full input (not implemented yet for bprop)
478
- if ctx.spatial_group_rank < ctx.spatial_group_size-1:
479
- ctx.stream2.wait_stream(ctx.stream1)
480
- with torch.cuda.stream(ctx.stream2):
481
- if ctx.explicit_nhwc:
482
- btm_fat_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
483
- btm_fat_halo[:,:2,:,:].copy_(grad_out2[:,Hs-2:,:,:])
484
- btm_fat_halo[:,2:,:,:].copy_(btm_halo)
485
- btm_fat_relu_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
486
- btm_fat_relu_halo[:,:2,:,:].copy_(relu1[:,Hs-2:,:,:])
487
- btm_fat_relu_halo[:,2:,:,:].zero_()
488
- else:
489
- btm_fat_halo = torch.empty((N,C,3,W),dtype=grad_out2.dtype,device=grad_out2.device)
490
- btm_fat_halo[:,:,:2,:].copy_(grad_out2[:,:,Hs-2:,:])
491
- btm_fat_halo[:,:,2:,:].copy_(btm_halo)
492
- btm_fat_relu_halo = torch.empty((N,C,3,W),dtype=grad_out2.dtype,device=grad_out2.device)
493
- btm_fat_relu_halo[:,:,:2,:].copy_(relu1[:,:,Hs-2:,:])
494
- btm_fat_relu_halo[:,:,2:,:].zero_()
495
- btm_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, btm_fat_halo, btm_fat_relu_halo)
496
- if ctx.explicit_nhwc:
497
- btm_grad_out1_halo = btm_grad_out1_halo[:,1:2,:,:]
498
- else:
499
- btm_grad_out1_halo = btm_grad_out1_halo[:,:,1:2,:]
500
- if ctx.spatial_group_rank > 0:
501
- with torch.cuda.stream(ctx.stream1):
502
- if ctx.explicit_nhwc:
503
- top_fat_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
504
- top_fat_halo[:,:1,:,:].copy_(top_halo)
505
- top_fat_halo[:,1:,:,:].copy_(grad_out2[:,:2,:,:])
506
- top_fat_relu_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
507
- top_fat_relu_halo[:,:1,:,:].zero_()
508
- top_fat_relu_halo[:,1:,:,:].copy_(relu1[:,:2,:,:])
509
- else:
510
- top_fat_halo = torch.empty((N,C,3,W),dtype=grad_out2.dtype,device=grad_out2.device)
511
- top_fat_halo[:,:,:1,:].copy_(top_halo)
512
- top_fat_halo[:,:,1:,:].copy_(grad_out2[:,:,:2,:])
513
- top_fat_relu_halo = torch.empty((N,C,3,W),dtype=grad_out2.dtype,device=grad_out2.device)
514
- top_fat_relu_halo[:,:,:1,:].zero_()
515
- top_fat_relu_halo[:,:,1:,:].copy_(relu1[:,:,:2,:])
516
- top_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, top_fat_halo, top_fat_relu_halo)
517
- if ctx.explicit_nhwc:
518
- top_grad_out1_halo = top_grad_out1_halo[:,1:2,:,:]
519
- else:
520
- top_grad_out1_halo = top_grad_out1_halo[:,:,1:2,:]
521
- if ctx.use_delay_kernel: inc.add_delay(10)
522
- elif ctx.spatial_method != 3:
523
- assert(False), "spatial_method must be 1, 2 or 3"
524
-
525
- # compute grad_out1 for internal cells
526
- if ctx.spatial_group_size <= 1 or ctx.spatial_method == 1 or ctx.spatial_method == 2:
527
- grad_out1 = fast_bottleneck.backward_grad_out1(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
528
- elif ctx.spatial_group_size > 1 and ctx.spatial_method == 3:
529
- grad_out1 = fast_bottleneck.backward_grad_out1_mask(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2, ctx.thresholdTop, ctx.thresholdBottom)
530
-
531
- # apply halo cells to grad_out1
532
- if ctx.spatial_group_size > 1:
533
- w = t_list[2]
534
- z = t_list[4]
535
- relu1 = t_list[12]
536
- #print("w.shape = %s, z.shape = %s, relu1.shape = %s" % (str(list(w.shape)), str(list(z.shape)), str(list(relu1.shape))))
537
- if ctx.spatial_method == 1 or ctx.spatial_method == 2:
538
- if ctx.spatial_group_rank < ctx.spatial_group_size-1:
539
- torch.cuda.current_stream().wait_stream(ctx.stream2)
540
- if ctx.explicit_nhwc:
541
- grad_out1[:,Hs-1:,:,:].copy_(btm_grad_out1_halo)
542
- else:
543
- grad_out1[:,:,Hs-1:,:].copy_(btm_grad_out1_halo)
544
- #print("ctx.spatial_group_rank = %d, apply grad_out1 btm halo (grad_out1.shape = %s)" % (ctx.spatial_group_rank, str(list(grad_out1.shape))))
545
- if ctx.spatial_group_rank > 0:
546
- torch.cuda.current_stream().wait_stream(ctx.stream1)
547
- if ctx.explicit_nhwc:
548
- grad_out1[:,:1,:,:].copy_(top_grad_out1_halo)
549
- else:
550
- grad_out1[:,:,:1,:].copy_(top_grad_out1_halo)
551
- #print("ctx.spatial_group_rank = %d, apply grad_out1 top halo (grad_out1.shape = %s)" % (ctx.spatial_group_rank, str(list(grad_out1.shape))))
552
- elif ctx.spatial_method == 3:
553
- if ctx.spatial_group_rank < ctx.spatial_group_size-1:
554
- if ctx.explicit_nhwc:
555
- btm_relu_halo = relu1[:,Hs-1:,:,:].clone()
556
- btm_grad_out1 = grad_out1[:,Hs-1:,:,:]
557
- else:
558
- btm_relu_halo = relu1[:,:,Hs-1:,:].clone()
559
- btm_grad_out1 = grad_out1[:,:,Hs-1:,:]
560
- w1by3 = w[:,:1,:,:].clone()
561
- ctx.stream2.wait_stream(ctx.stream1) # wait for halo transfers to finish
562
- ctx.stream2.wait_stream(torch.cuda.current_stream()) # wait for backward_grad_out1_mask to finish before launching halo correction kernel
563
- with torch.cuda.stream(ctx.stream2):
564
- btm_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo_corr(ctx.explicit_nhwc, ctx.stride_1x1, t_list, w1by3, grads, btm_halo, btm_relu_halo, btm_grad_out1.clone())
565
- btm_grad_out1.copy_(btm_grad_out1_halo)
566
- if ctx.spatial_group_rank > 0:
567
- if ctx.explicit_nhwc:
568
- top_relu_halo = relu1[:,:1,:,:].clone()
569
- top_grad_out1 = grad_out1[:,:1,:,:]
570
- else:
571
- top_relu_halo = relu1[:,:,:1,:].clone()
572
- top_grad_out1 = grad_out1[:,:,:1,:]
573
- w1by3 = w[:,2:,:,:].clone()
574
- ctx.stream1.wait_stream(torch.cuda.current_stream()) # wait for backward_grad_out1_mask to finish before launching halo correction kernel
575
- with torch.cuda.stream(ctx.stream1):
576
- top_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo_corr(ctx.explicit_nhwc, ctx.stride_1x1, t_list, w1by3, grads, top_halo, top_relu_halo, top_grad_out1.clone())
577
- top_grad_out1.copy_(top_grad_out1_halo)
578
- if ctx.spatial_group_rank < ctx.spatial_group_size-1:
579
- torch.cuda.current_stream().wait_stream(ctx.stream2) # wait for halo correction to finish
580
- if ctx.spatial_group_rank > 0:
581
- torch.cuda.current_stream().wait_stream(ctx.stream1)
582
-
583
- wgrad1_stream = torch.cuda.Stream()
584
- wgrad1_stream.wait_stream(torch.cuda.current_stream())
585
- fast_bottleneck.backward_rest(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2, grad_out1)
586
- with torch.cuda.stream(wgrad3_stream):
587
- fast_bottleneck.backward_wgrad3(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads)
588
- with torch.cuda.stream(wgrad2_stream):
589
- if ctx.spatial_group_size > 1:
590
- fast_bottleneck.backward_wgrad2_pad(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, out1_pad, grad_out2)
591
- else:
592
- fast_bottleneck.backward_wgrad2(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
593
- with torch.cuda.stream(wgrad1_stream):
594
- fast_bottleneck.backward_wgrad1(ctx.explicit_nhwc, ctx.stride_1x1, t_list, grads, grad_out1)
595
- torch.cuda.current_stream().wait_stream(wgrad3_stream)
596
- torch.cuda.current_stream().wait_stream(wgrad2_stream)
597
- torch.cuda.current_stream().wait_stream(wgrad1_stream)
598
-
599
- return (None, None, None, None, None, None, None, None, None, None, None, None, *grads)
600
-
601
- spatial_bottleneck_function = SpatialBottleneckFunction.apply
602
-
603
- class SpatialBottleneck(torch.nn.Module):
604
- # Bottleneck in torchvision places the stride for downsampling at the 3x3 convolution (self.conv2),
605
- # while the original implementation places the stride at the first 1x1 convolution (self.conv1),
606
- # according to "Deep Residual Learning for Image Recognition" (https://arxiv.org/abs/1512.03385).
607
- # This variant is also known as ResNet V1.5 and improves accuracy according to
608
- # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
609
- # Here we put the stride at the 1x1 convolution.
610
-
611
- def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
612
- dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False,
613
- spatial_parallel_args=None):
614
- super(SpatialBottleneck, self).__init__()
615
- if groups != 1:
616
- raise RuntimeError('Only support groups == 1')
617
- if dilation != 1:
618
- raise RuntimeError('Only support dilation == 1')
619
- if norm_func is None:
620
- norm_func = FrozenBatchNorm2d
621
- else:
622
- raise RuntimeError('Only support frozen BN now.')
623
-
624
- if stride != 1 or in_channels != out_channels:
625
- self.downsample = nn.Sequential(
626
- conv1x1(in_channels, out_channels, stride),
627
- norm_func(out_channels),
628
- )
629
- else:
630
- self.downsample = None
631
-
632
- # Both self.conv2 and self.downsample layers downsample the input when stride != 1
633
- self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
634
- self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
635
- self.conv3 = conv1x1(bottleneck_channels, out_channels)
636
- self.relu = nn.ReLU(inplace=True)
637
- self.stride = stride
638
-
639
- self.bn1 = norm_func(bottleneck_channels)
640
- self.bn2 = norm_func(bottleneck_channels)
641
- self.bn3 = norm_func(out_channels)
642
- self.w_scale = None
643
-
644
- self.use_cudnn = use_cudnn
645
-
646
- # setup conv weights
647
- self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
648
- if self.downsample is not None:
649
- self.w_conv.append(self.downsample[0].weight)
650
-
651
- # init weight in nchw format before possible transpose
652
- for w in self.w_conv:
653
- kaiming_uniform_(w, a=1)
654
-
655
- self.thresholdTop, self.thresholdBottom = None, None
656
-
657
- # TODO: prevent unsupported case usage
658
- # support cases
659
- #                  native    cudnn
660
- # normal           yes       no
661
- # channel_last     yes       yes
662
- # explicit_nhwc    no        yes
663
- self.explicit_nhwc = explicit_nhwc
664
- if self.explicit_nhwc:
665
- for p in self.parameters():
666
- with torch.no_grad():
667
- p.data = p.data.permute(0,2,3,1).contiguous()
668
-
669
- # spatial communicator
670
- if spatial_parallel_args is None:
671
- self.spatial_parallel_args = (1, 0, None, None, 0, False)
672
- else:
673
- self.spatial_parallel_args = spatial_parallel_args
674
- return
675
-
676
- # Returns a single callable that recomputes scale and bias for all frozen batch norms.
677
- # This method must be called before CUDA graph capture.
678
- # The callable it returns can be called at any time.
679
- # Calling this method prevents scale and bias from being recomputed on every forward call.
680
- def get_scale_bias_callable(self):
681
- self.w_scale, self.w_bias, args = [], [], []
682
- batch_norms = [self.bn1, self.bn2, self.bn3]
683
- if self.downsample is not None:
684
- batch_norms.append(self.downsample[1])
685
- for bn in batch_norms:
686
- s = torch.empty_like(bn.weight)
687
- b = torch.empty_like(s)
688
- args.append( (bn.weight, bn.bias, bn.running_mean, bn.running_var, s, b) )
689
- if self.explicit_nhwc:
690
- self.w_scale.append( s.reshape(1, 1, 1, -1) )
691
- self.w_bias.append( b.reshape(1, 1, 1, -1) )
692
- else:
693
- self.w_scale.append( s.reshape(1, -1, 1, 1) )
694
- self.w_bias.append( b.reshape(1, -1, 1, 1) )
695
- return func.partial(compute_scale_bias_method, self.explicit_nhwc, args)
696
-
697
- def forward(self, x):
698
- if self.use_cudnn:
699
- if self.thresholdTop is None:
700
- spatial_group_size, spatial_group_rank, _, _, _, _ = self.spatial_parallel_args
701
- if self.explicit_nhwc:
702
- N,H,W,C = list(x.shape)
703
- else:
704
- N,C,H,W = list(x.shape)
705
- self.thresholdTop = torch.tensor([1 if spatial_group_rank > 0 else 0], dtype=torch.int32, device='cuda')
706
- self.thresholdBottom = torch.tensor([H-2 if spatial_group_rank < spatial_group_size - 1 else H-1], dtype=torch.int32, device='cuda')
707
-
708
- if self.w_scale is None:
709
- # calculate scale/bias from registered buffers
710
- # TODO: make this better
711
- s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
712
- s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
713
- s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
714
- w_scale = [s1, s2, s3]
715
- w_bias = [b1, b2, b3]
716
- if self.downsample is not None:
717
- s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
718
- w_scale.append(s4)
719
- w_bias.append(b4)
720
- out = spatial_bottleneck_function(*self.spatial_parallel_args, self.explicit_nhwc, self.stride, w_scale, w_bias, self.thresholdTop, self.thresholdBottom, x, *self.w_conv)
721
- else:
722
- out = spatial_bottleneck_function(*self.spatial_parallel_args, self.explicit_nhwc, self.stride, self.w_scale, self.w_bias, self.thresholdTop, self.thresholdBottom, x, *self.w_conv)
723
- return out
724
-
725
- if self.explicit_nhwc:
726
- raise RuntimeError('explicit nhwc with native ops is not supported.')
727
-
728
- # fallback to native ops
729
- identity = x
730
-
731
- out = self.conv1(x)
732
- out = self.bn1(out)
733
- out = self.relu(out)
734
-
735
- out = self.conv2(out)
736
- out = self.bn2(out)
737
- out = self.relu(out)
738
-
739
- out = self.conv3(out)
740
- out = self.bn3(out)
741
-
742
- if self.downsample is not None:
743
- identity = self.downsample(x)
744
-
745
- out += identity
746
- out = self.relu(out)
747
-
748
- return out
749
-
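For context, a hypothetical usage sketch of the cuDNN bottleneck path removed above. The import path, constructor arguments, and the no-argument call to the scale/bias callable are assumptions based on the code in this diff; CUDA graph capture is shown only because get_scale_bias_callable is documented as a prerequisite for it.

import torch
from apex.contrib.bottleneck import Bottleneck   # assumed import path

model = Bottleneck(32, 8, 32, stride=1, use_cudnn=True).cuda().half()
model = model.to(memory_format=torch.channels_last)

# Freeze scale/bias recomputation so the forward pass is capture-safe;
# re-invoke the callable whenever the frozen-BN buffers change.
recompute_scale_bias = model.get_scale_bias_callable()
recompute_scale_bias()                      # assumed to take no arguments

x = torch.randn(4, 32, 56, 56, device='cuda', dtype=torch.half)
x = x.to(memory_format=torch.channels_last)
y = model(x)                                # warm-up before capture

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    y = model(x)
g.replay()                                  # re-runs the captured forward on x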
apex/apex/contrib/bottleneck/halo_exchangers.py DELETED
@@ -1,180 +0,0 @@
1
- import torch
2
- import torch.distributed as dist
3
- from torch import nn
4
- import nccl_p2p_cuda as inc
5
- import peer_memory_cuda as pm
6
-
7
- # Communication free halo exchanger.
8
- # NB! This halo exchanger does not exchange halos with neighbors as it should, it merely swaps the inputs
9
- # NB! This is only useful for performance testing.
10
- # NB! Do not use for actual production runs
11
- class HaloExchanger(object):
12
- def __init__(self, ranks, rank_in_group):
13
- self.stream1 = torch.cuda.Stream()
14
- self.stream2 = torch.cuda.Stream()
15
- self.stream3 = torch.cuda.Stream()
16
- self.group_size = len(ranks)
17
- self.ranks = ranks
18
- self.rank_in_group = rank_in_group
19
- self.wrap_around_left_rank_in_group = (rank_in_group + self.group_size - 1) % self.group_size
20
- self.wrap_around_right_rank_in_group = (rank_in_group + 1) % self.group_size
21
- self.left_rank = ranks[rank_in_group-1] if rank_in_group > 0 else -1
22
- self.left_zero = True if rank_in_group == 0 else False
23
- self.right_rank = ranks[rank_in_group+1] if rank_in_group < self.group_size - 1 else -1
24
- self.right_zero = True if rank_in_group == self.group_size - 1 else False
25
-
26
- class HaloExchangerNoComm(HaloExchanger):
27
- def __init__(self, ranks, rank_in_group):
28
- super(HaloExchangerNoComm, self).__init__(ranks, rank_in_group)
29
-
30
- def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
31
- if left_input_halo is None:
32
- return right_output_halo, left_output_halo
33
- else:
34
- left_input_halo.copy_(right_output_halo)
35
- right_input_halo.copy_(left_output_halo)
36
-
37
- class HaloExchangerAllGather(HaloExchanger):
38
- def __init__(self, ranks, rank_in_group, comm):
39
- super(HaloExchangerAllGather, self).__init__(ranks, rank_in_group)
40
- # self.comm must be NCCL process_group created with torch.distributed.new_group(ranks=ranks)
41
- self.comm = comm
42
-
43
- def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
44
- N,Hh,W,C = list(left_output_halo.shape)
45
- send_halos = torch.empty((N,2*Hh,W,C),dtype=left_output_halo.dtype,device=left_output_halo.device)
46
- send_halos[:,:Hh,:,:].copy_(left_output_halo)
47
- send_halos[:,Hh:,:,:].copy_(right_output_halo)
48
- all_halos = torch.empty((N,2*Hh*self.group_size,W,C),dtype=left_output_halo.dtype,device=left_output_halo.device)
49
- all_halos = [all_halos[:,i*2*Hh:(i+1)*2*Hh,:,:] for i in range(self.group_size)]
50
- torch.distributed.all_gather(all_halos,send_halos,group=self.comm,no_copy=True)
51
- ag_left_input_halo = all_halos[self.wrap_around_left_rank_in_group][:,Hh:,:,:]
52
- ag_right_input_halo = all_halos[self.wrap_around_right_rank_in_group][:,:Hh,:,:]
53
- if left_input_halo is None:
54
- if self.left_zero:
55
- ag_left_input_halo.zero_()
56
- if self.right_zero:
57
- ag_right_input_halo.zero_()
58
- return ag_left_input_halo, ag_right_input_halo
59
- else:
60
- if self.left_zero:
61
- left_input_halo.zero_()
62
- else:
63
- left_input_halo.copy_(ag_left_input_halo)
64
- if self.right_zero:
65
- right_input_halo.zero_()
66
- else:
67
- right_input_halo.copy_(ag_right_input_halo)
68
-
69
- class HaloExchangerSendRecv(HaloExchanger):
70
- def __init__(self, ranks, rank_in_group):
71
- super(HaloExchangerSendRecv, self).__init__(ranks, rank_in_group)
72
- nccl_id = inc.get_unique_nccl_id(1).cuda()
73
- torch.distributed.broadcast(nccl_id, 0)
74
- nccl_id = nccl_id.cpu()
75
- print("%d :: nccl_id = %s" % (torch.distributed.get_rank(), str(nccl_id)))
76
- # Create another global nccl communicator in addition to the one created by torch.distributed.init_process_group("nccl")
77
- # This is unavoidable because the underlying NCCL communicator torch.distributed creates is a protected variable, hence
78
- # it cannot be accessed from another class.
79
- # TODO: Figure out a way to avoid creating a second global communicator
80
- assert(torch.distributed.get_rank() == self.ranks[self.rank_in_group]), "ranks[%d](%d) != torch.distributed.get_rank()(%d)" % (self.rank_in_group, self.ranks[self.rank_in_group], torch.distributed.get_rank())
81
- self.handle = inc.init_nccl_comm(nccl_id, torch.distributed.get_rank(), torch.distributed.get_world_size())
82
-
83
- def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
84
- if left_input_halo is None:
85
- left_input_halo, right_input_halo = inc.left_right_halo_exchange(self.handle, self.left_rank, self.right_rank , left_output_halo, right_output_halo)
86
- return left_input_halo, right_input_halo
87
- else:
88
- inc.left_right_halo_exchange_inplace(self.handle, self.left_rank, self.right_rank, left_output_halo, right_output_halo, left_input_halo, right_input_halo)
89
-
90
- class HaloExchangerPeer(HaloExchanger):
91
- def __init__(self, ranks, rank_in_group, peer_pool, explicit_nhwc, numSM=0):
92
- super(HaloExchangerPeer, self).__init__(ranks, rank_in_group)
93
- self.diagnostics = False
94
- self.explicit_nhwc = explicit_nhwc
95
- self.numSM = numSM
96
- self.peer_pool = peer_pool
97
-
98
- def _allocate_peer_tensor(self, halo):
99
-
100
- # Compute size in bytes
101
- # Note: Pad buffer so each CUDA block gets required buffer size
102
- size = 4 * halo.numel() * halo.element_size()
103
- size_per_block = 128 * 2 * 16 # 128 threads each require two 128b buffers
104
- size = (size + size_per_block - 1) // size_per_block * size_per_block
105
-
106
- # Construct dtype peer buffer with desired size
107
- shape = [1, 1, 1, size // halo.element_size()]
108
- return self.peer_pool.allocate_peer_tensors(shape, halo.dtype, False, True)
109
-
110
- def left_right_halo_exchange(self, left_output_halo, right_output_halo, left_input_halo=None, right_input_halo=None):
111
- inplace = False if left_input_halo is None and right_input_halo is None else True
112
- if not inplace:
113
- left_input_halo = torch.empty_like(right_output_halo)
114
- right_input_halo = torch.empty_like(left_output_halo)
115
- channels_last = left_output_halo.is_contiguous(memory_format=torch.channels_last) and not self.explicit_nhwc
116
- left_tx = self._allocate_peer_tensor(left_input_halo)
117
- right_tx = self._allocate_peer_tensor(right_input_halo)
118
- pm.push_pull_halos_1d(
119
- self.diagnostics, self.explicit_nhwc, self.numSM, self.rank_in_group,
120
- self.left_zero, left_output_halo, left_tx[self.rank_in_group], right_tx[self.wrap_around_left_rank_in_group], left_input_halo,
121
- self.right_zero, right_output_halo, right_tx[self.rank_in_group], left_tx[self.wrap_around_right_rank_in_group], right_input_halo,
122
- )
123
- if not inplace:
124
- return left_input_halo, right_input_halo
125
-
126
- # Class that combines input volume with halos from neighbors (1d).
127
- class HaloPadder:
128
- def __init__(self, halo_ex):
129
- self.halo_ex = halo_ex
130
- self.stream1 = torch.cuda.Stream()
131
- self.stream2 = torch.cuda.Stream()
132
-
133
- def __call__(self, y, half_halo, explicit_nhwc, H_split):
134
- channels_last = not explicit_nhwc and y.is_contiguous(memory_format=torch.channels_last)
135
- if explicit_nhwc:
136
- N,H,W,C = list(y.shape)
137
- if H_split:
138
- padded_shape = [N,H+2*half_halo,W,C]
139
- ypad = torch.empty(padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.contiguous_format)
140
- yleft = ypad[:,:half_halo,:,:]
141
- ymid = ypad[:,half_halo:H+half_halo,:,:]
142
- yright = ypad[:,H+half_halo:H+2*half_halo,:,:]
143
- oleft = y[:,:half_halo,:,:]
144
- oright = y[:,H-half_halo:,:,:]
145
- else:
146
- padded_shape = [N,H,W+2*half_halo,C]
147
- ypad = torch.empty(padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.contiguous_format)
148
- yleft = ypad[:,:,:half_halo,:]
149
- ymid = ypad[:,:,half_halo:W+half_halo,:]
150
- yright = ypad[:,:,W+half_halo:W+2*half_halo,:]
151
- oleft = y[:,:,:half_halo,:]
152
- oright = y[:,:,W-half_halo:,:]
153
- else:
154
- N,C,H,W = list(y.shape)
155
- if H_split:
156
- padded_shape = [N,C,H+2*half_halo,W]
157
- ypad = torch.empty(padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.channels_last)
158
- yleft = ypad[:,:,:half_halo,:]
159
- ymid = ypad[:,:,half_halo:H+half_halo,:]
160
- yright = ypad[:,:,H+half_halo:H+2*half_halo,:]
161
- oleft = y[:,:,:half_halo,:]
162
- oright = y[:,:,H-half_halo:,:]
163
- else:
164
- padded_shape = [N,C,H,W+2*half_halo]
165
- ypad = torch.empty(padded_shape, dtype=y.dtype, device=y.device, memory_format=torch.channels_last)
166
- yleft = ypad[:,:,:,:half_halo]
167
- ymid = ypad[:,:,:,half_halo:W+half_halo]
168
- yright = ypad[:,:,:,W+half_halo:W+2*half_halo]
169
- oleft = y[:,:,:,:half_halo]
170
- oright = y[:,:,:,W-half_halo:]
171
- with torch.cuda.stream(self.stream1):
172
- self.halo_ex(oleft, oright, yleft, yright)
173
- with torch.cuda.stream(self.stream2):
174
- ymid.copy_(y)
175
- return ypad
176
-
177
- def wait(self):
178
- current_stream = torch.cuda.current_stream()
179
- current_stream.wait_stream(self.stream1)
180
- current_stream.wait_stream(self.stream2)
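To make the halo data flow concrete, here is a small single-process illustration (my own sketch, not part of the deleted file) of what a 1D halo exchange plus padding produces for an H-split NCHW tensor; it mirrors the out-of-place HaloExchangerNoComm semantics and the HaloPadder layout for H_split=True.

import torch

def swap_halos(left_out, right_out):
    # Mimics HaloExchangerNoComm: with no real neighbors, the "received" left
    # halo is simply the halo that would have been sent right, and vice versa.
    return right_out.clone(), left_out.clone()

x = torch.arange(2 * 3 * 8 * 4, dtype=torch.float32).reshape(2, 3, 8, 4)  # N,C,H,W
half_halo = 1
left_out  = x[:, :, :half_halo, :]     # rows this rank would send "up"
right_out = x[:, :, -half_halo:, :]    # rows this rank would send "down"
left_in, right_in = swap_halos(left_out, right_out)

# Assemble the padded tensor the way HaloPadder does for H_split=True.
xpad = torch.empty(2, 3, 8 + 2 * half_halo, 4)
xpad[:, :, :half_halo, :] = left_in
xpad[:, :, half_halo:-half_halo, :] = x
xpad[:, :, -half_halo:, :] = right_in
print(xpad.shape)   # torch.Size([2, 3, 10, 4])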
apex/apex/contrib/bottleneck/test.py DELETED
@@ -1,71 +0,0 @@
1
- import torch
2
- from bottleneck import Bottleneck
3
- torch.manual_seed(23337)
4
-
5
- # use True to print layerwise sum for all outputs in reference code path
6
- DEBUG = False#True
7
-
8
- for stride, o_channel in [(1,32), (1,128), (2,32)]:
9
- print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
10
- a_ = torch.randn(17,32,28,28)
11
-
12
- a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
13
- model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)
14
-
15
- # test model
16
- b = model(a)
17
- b.mean().backward()
18
- d_grad = a.grad.float()
19
- a.grad = None
20
- torch.cuda.synchronize()
21
-
22
- if DEBUG:
23
- print("[DEBUG] ref dx :", d_grad.sum().item())
24
- # print wgrad. we don't need to reset since later cpp print before accumulation
25
- for i, w in enumerate(model.w_conv):
26
- print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())
27
-
28
- wgrads = []
29
- for w in model.w_conv:
30
- wgrads.append(w.grad.float())
31
-
32
- model.use_cudnn = True
33
- model.zero_grad()
34
- c = model(a)
35
- c.mean().backward()
36
-
37
- torch.cuda.synchronize()
38
- print("comparing native and channels_last:")
39
- print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
40
- print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
41
- for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
42
- print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
43
-
44
- nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
45
- nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
46
- for p,q in zip(model.parameters(), nhwc_model.parameters()):
47
- # model's storage is already in nhwc, we clone and assign to explicit nhwc model
48
- q.data.copy_(p.data.permute(0,2,3,1).contiguous())
49
- for p,q in zip(model.buffers(), nhwc_model.buffers()):
50
- q.data.copy_(p.data)
51
-
52
- d = nhwc_model(nhwc_a)
53
- d.mean().backward()
54
- torch.cuda.synchronize()
55
-
56
- # reset reference to cudnn channels_last permute
57
- #c_s = c.storage().tolist()
58
- #d_s = d.storage().tolist()
59
- #print(max([x-y for x,y in zip(c_s,d_s)]))
60
- c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
61
- d_grad = a.grad.float().permute(0,2,3,1).contiguous()
62
- wgrads = []
63
- for w in model.w_conv:
64
- wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())
65
-
66
- torch.cuda.synchronize()
67
- print("comparing nhwc and channels_last:")
68
- print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
69
- print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
70
- for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
71
- print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
apex/apex/contrib/clip_grad/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .clip_grad import clip_grad_norm_
apex/apex/contrib/clip_grad/clip_grad.py DELETED
@@ -1,128 +0,0 @@
1
- from typing import Union, Iterable
2
-
3
- import torch
4
-
5
- _kernel_import_succeeded = False
6
- try:
7
- import amp_C
8
- from apex.multi_tensor_apply import multi_tensor_applier
9
- _kernel_import_succeeded = True
10
- except ImportError:
11
- _kernel_import_succeeded = False
12
-
13
- _tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]]
14
-
15
-
16
- def clip_grad_norm_(
17
- parameters: _tensor_or_tensors, max_norm: float, norm_type: float = 2.0,
18
- error_if_nonfinite: bool = False) -> torch.Tensor:
19
- r"""Clips gradient norm of an iterable of parameters.
20
-
21
- The norm is computed over all gradients together, as if they were
22
- concatenated into a single vector. Gradients are modified in-place.
23
-
24
- This is identical to torch.nn.utils.clip_grad_norm_, except it
25
- uses a fused CUDA kernel when computing the 2-norm of GPU tensors
26
- in float32 and float16.
27
-
28
- Args:
29
- parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
30
- single Tensor that will have gradients normalized
31
- max_norm (float or int): max norm of the gradients
32
- norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
33
- infinity norm.
34
- error_if_nonfinite (bool): if True, an error is thrown if the total
35
- norm of the gradients from :attr:`parameters` is ``nan``,
36
- ``inf``, or ``-inf``. Default: False (will switch to True in the future)
37
-
38
- Returns:
39
- Total norm of the parameters (viewed as a single vector).
40
-
41
- """
42
- if isinstance(parameters, torch.Tensor):
43
- parameters = [parameters]
44
- parameters = [p for p in parameters if p.grad is not None]
45
- max_norm = float(max_norm)
46
- norm_type = float(norm_type)
47
-
48
- # Trivial case
49
- if len(parameters) == 0:
50
- return torch.tensor(0.)
51
-
52
- # Fallback implementation
53
- if not (_kernel_import_succeeded
54
- and norm_type == 2.0
55
- and any(p.is_cuda for p in parameters)):
56
- return torch.nn.utils.clip_grad_norm_(
57
- parameters,
58
- max_norm,
59
- norm_type=norm_type,
60
- error_if_nonfinite = error_if_nonfinite,
61
- )
62
-
63
- # Find fp32 and fp16 gradients on GPU
64
- device = next(p.device for p in parameters if p.is_cuda)
65
- grads_fp32, grads_fp16, grads_misc = [], [], []
66
- for p in parameters:
67
- grad = p.grad.detach()
68
- if p.dtype == torch.float32 and p.device == device:
69
- grads_fp32.append(grad)
70
- elif p.dtype == torch.float16 and p.device == device:
71
- grads_fp16.append(grad)
72
- else:
73
- grads_misc.append(grad)
74
-
75
- # Compute gradient L2 norms
76
- norms = []
77
- dummy_overflow_buf = torch.zeros([1], dtype=torch.int32, device=device)
78
- if grads_fp32:
79
- norms.append(
80
- multi_tensor_applier(
81
- amp_C.multi_tensor_l2norm,
82
- dummy_overflow_buf,
83
- [grads_fp32],
84
- False,
85
- )[0]
86
- )
87
- if grads_fp16:
88
- norms.append(
89
- multi_tensor_applier(
90
- amp_C.multi_tensor_l2norm,
91
- dummy_overflow_buf,
92
- [grads_fp16],
93
- False,
94
- )[0],
95
- )
96
- for g in grads_misc:
97
- norms.append(torch.linalg.norm(g).unsqueeze(0).to(device))
98
- total_norm = torch.linalg.norm(torch.cat(norms))
99
-
100
- # Check for non-finite values
101
- if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()):
102
- raise RuntimeError(
103
- f'The total norm of order {norm_type} for gradients from '
104
- '`parameters` is non-finite, so it cannot be clipped. To disable '
105
- 'this error and scale the gradients by the non-finite norm anyway, '
106
- 'set `error_if_nonfinite=False`')
107
-
108
- # Scale gradients
109
- clip_coef = max_norm / (total_norm + 1e-6)
110
- clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
111
- if grads_fp32:
112
- multi_tensor_applier(
113
- amp_C.multi_tensor_scale,
114
- dummy_overflow_buf,
115
- [grads_fp32, grads_fp32],
116
- clip_coef_clamped,
117
- )
118
- if grads_fp16:
119
- multi_tensor_applier(
120
- amp_C.multi_tensor_scale,
121
- dummy_overflow_buf,
122
- [grads_fp16, grads_fp16],
123
- clip_coef_clamped,
124
- )
125
- for g in grads_misc:
126
- g.mul_(clip_coef_clamped.to(g.device))
127
-
128
- return total_norm
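A minimal usage sketch of the fused gradient clipping deleted above, using the import path re-exported by the package __init__ shown earlier; the model and hyperparameters are placeholders of my own.

import torch
from apex.contrib.clip_grad import clip_grad_norm_

model = torch.nn.Linear(1024, 1024).cuda().half()
opt = torch.optim.SGD(model.parameters(), lr=1e-3)

x = torch.randn(32, 1024, device='cuda', dtype=torch.half)
loss = model(x).float().pow(2).mean()
loss.backward()

# Uses the fused multi-tensor L2-norm/scale kernels when amp_C is importable,
# otherwise falls back to torch.nn.utils.clip_grad_norm_.
total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)
opt.step()
opt.zero_grad(set_to_none=True)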
apex/apex/contrib/conv_bias_relu/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .conv_bias_relu import ConvBiasReLU, ConvBias, ConvBiasMaskReLU, ConvFrozenScaleBiasReLU
2
-
apex/apex/contrib/conv_bias_relu/conv_bias_relu.py DELETED
@@ -1,104 +0,0 @@
1
- import pdb
2
-
3
- import torch
4
- from torch.autograd import gradcheck
5
-
6
- from apex import check_cudnn_version_and_warn
7
- import fused_conv_bias_relu
8
-
9
- check_cudnn_version_and_warn(__name__, 8400)
10
-
11
-
12
- class ConvBiasReLU_(torch.autograd.Function):
13
- @staticmethod
14
- @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
15
- def forward(ctx, x, weight, bias, padding, stride):
16
- outputs = fused_conv_bias_relu.forward([x, weight, bias], padding, stride)
17
- ctx.save_for_backward(x, weight, outputs[0])
18
- ctx.padding = padding
19
- ctx.stride = stride
20
-
21
- return outputs[0]
22
-
23
- @staticmethod
24
- @torch.cuda.amp.custom_bwd
25
- def backward(ctx, grad_output):
26
- bwd_args = [*ctx.saved_tensors, grad_output]
27
- padding = ctx.padding
28
- stride = ctx.stride
29
- grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)
30
-
31
- return grads[0], grads[1], grads[2], None, None
32
-
33
-
34
- class ConvBiasMaskReLU_(torch.autograd.Function):
35
- @staticmethod
36
- @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
37
- def forward(ctx, x, weight, bias, mask, padding, stride):
38
- outputs = fused_conv_bias_relu.forward_mask([x, weight, bias, mask], padding, stride)
39
- ctx.save_for_backward(x, weight, outputs[0])
40
- ctx.padding = padding
41
- ctx.stride = stride
42
-
43
- return outputs[0]
44
-
45
- @staticmethod
46
- @torch.cuda.amp.custom_bwd
47
- def backward(ctx, grad_output):
48
- bwd_args = [*ctx.saved_tensors, grad_output]
49
- padding = ctx.padding
50
- stride = ctx.stride
51
- grads = fused_conv_bias_relu.backward(bwd_args, padding, stride)
52
-
53
- return grads[0], grads[1], grads[2], None, None, None
54
-
55
-
56
- class ConvBias_(torch.autograd.Function):
57
- @staticmethod
58
- @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
59
- def forward(ctx, x, weight, bias, padding, stride):
60
- outputs = fused_conv_bias_relu.forward_no_relu([x, weight, bias], padding, stride)
61
- ctx.save_for_backward(x, weight)
62
- ctx.padding = padding
63
- ctx.stride = stride
64
-
65
- return outputs[0]
66
-
67
- @staticmethod
68
- @torch.cuda.amp.custom_bwd
69
- def backward(ctx, grad_output):
70
- bwd_args = [*ctx.saved_tensors, grad_output]
71
- padding = ctx.padding
72
- stride = ctx.stride
73
- grads = fused_conv_bias_relu.backward_no_relu(bwd_args, padding, stride)
74
-
75
- return grads[0], grads[1], grads[2], None, None
76
-
77
-
78
- class ConvFrozenScaleBiasReLU_(torch.autograd.Function):
79
- @staticmethod
80
- @torch.cuda.amp.custom_fwd(cast_inputs=torch.half)
81
- def forward(ctx, x, weight, scale, bias, padding, stride):
82
- output = fused_conv_bias_relu.forward_cscale_cbias_relu([x, weight, scale, bias], padding, stride)
83
- ctx.save_for_backward(x, weight, scale, output)
84
- ctx.padding = padding
85
- ctx.stride = stride
86
-
87
- return output
88
-
89
- @staticmethod
90
- @torch.cuda.amp.custom_bwd
91
- def backward(ctx, grad_output):
92
- bwd_args = [*ctx.saved_tensors, grad_output]
93
- padding = ctx.padding
94
- stride = ctx.stride
95
- grads = fused_conv_bias_relu.backward_cscale_cbias_relu(bwd_args, padding, stride)
96
-
97
- return grads[0], grads[1], None, None, None, None
98
-
99
-
100
- ConvBiasReLU = ConvBiasReLU_.apply
101
- ConvBiasMaskReLU = ConvBiasMaskReLU_.apply
102
- ConvBias = ConvBias_.apply
103
- ConvFrozenScaleBiasReLU = ConvFrozenScaleBiasReLU_.apply
104
-
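A hypothetical usage sketch of the fused op wrappers deleted above. The channels_last fp16 layout follows the ChannelsLast contiguity check in the C++ extension later in this diff, but the bias shape and the scalar padding/stride arguments are assumptions.

import torch
from apex.contrib.conv_bias_relu import ConvBiasReLU   # re-exported by the __init__ above

N, C_in, C_out, H, W = 8, 64, 128, 56, 56
x = torch.randn(N, C_in, H, W).cuda().half().to(memory_format=torch.channels_last).requires_grad_()
w = torch.randn(C_out, C_in, 3, 3).cuda().half().to(memory_format=torch.channels_last).requires_grad_()
b = torch.randn(1, C_out, 1, 1).cuda().half().requires_grad_()

# padding=1, stride=1 keeps the spatial size (scalar ints are an assumption here)
y = ConvBiasReLU(x, w, b, 1, 1)
y.float().mean().backward()       # populates x.grad, w.grad and b.grad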
apex/apex/contrib/csrc/bottleneck/bottleneck.cpp DELETED
The diff for this file is too large to render. See raw diff
apex/apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp DELETED
@@ -1,2153 +0,0 @@
1
- #include <ATen/ATen.h>
2
- #include <ATen/cudnn/Handle.h> // for getcudnnhandle
3
- #include <torch/extension.h>
4
- #include <torch/torch.h>
5
- #include <vector>
6
- #include <cudnn_frontend.h>
7
-
8
- #include <iostream>
9
-
10
- #ifdef DEBUG
11
- #define DEBUG_MSG(str) do { std::cout << str << std::endl; } while( false )
12
- #else
13
- #define DEBUG_MSG(str) do { } while ( false )
14
- #endif
15
-
16
- #ifdef DEBUG_CUDNN
17
- #define DEBUG_CUDNN_MSG(buf, str) do { buf << str << std::endl; } while( false )
18
- #else
19
- #define DEBUG_CUDNN_MSG(buf, str) do { } while ( false )
20
- #endif
21
-
22
- #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
23
- #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(at::MemoryFormat::ChannelsLast), #x " must be contiguous")
24
- #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
25
-
26
- #define checkCudnnErr(...) \
27
- do { \
28
- int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
29
- if (err) { \
30
- return; \
31
- } \
32
- } while (0)
33
-
34
-
35
- int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
36
- if (code) {
37
- printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
38
- return 1;
39
- }
40
- return 0;
41
- }
42
-
43
- void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort = true);
44
- #define checkCUDAError(val) { checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
45
-
46
- void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort) {
47
- if (code != cudaSuccess)
48
- {
49
- const char * errorMessage = cudaGetErrorString(code);
50
- fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n", func, file, line, code, errorMessage);
51
- if (abort){
52
- cudaDeviceReset();
53
- exit(code);
54
- }
55
- }
56
- }
57
-
58
- void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
59
- // For INT8x4 and INT8x32 we still compute standard strides here to input
60
- // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
61
- if (filterFormat == CUDNN_TENSOR_NCHW) {
62
- strideA[nbDims - 1] = 1;
63
- for (int64_t d = nbDims - 2; d >= 0; d--) {
64
- strideA[d] = strideA[d + 1] * dimA[d + 1];
65
- }
66
- } else {
67
- // Here we assume that the format is CUDNN_TENSOR_NHWC
68
- strideA[1] = 1;
69
- strideA[nbDims - 1] = strideA[1] * dimA[1];
70
- for (int64_t d = nbDims - 2; d >= 2; d--) {
71
- strideA[d] = strideA[d + 1] * dimA[d + 1];
72
- }
73
- strideA[0] = strideA[2] * dimA[2];
74
- }
75
- }
76
-
77
-
78
- int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
79
- return ((filterDim - 1) * dilation) + 1;
80
- }
81
-
82
-
83
- int getFwdConvPaddedImageDim(int tensorDim, int pad) {
84
- return tensorDim + (2 * pad);
85
- }
86
-
87
-
88
- int getFwdConvOutputDim(int tensorDim,
89
- int pad,
90
- int filterDim,
91
- int stride,
92
- int dilation) {
93
- int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation)) / stride + 1;
94
- return (p);
95
- }
96
-
97
-
98
- // create a cache for plan
99
- std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
100
-
101
-
102
- std::string getConvFusionString(int64_t* x_dim_padded,
103
- int64_t* padA,
104
- int64_t* convstrideA,
105
- int64_t* dilationA,
106
- int64_t* w_dim_padded,
107
- cudnnDataType_t dataType,
108
- std::string fusion_string) {
109
-
110
- for(int i=0;i<4;i++) {
111
- fusion_string += 'X';
112
- fusion_string += std::to_string(x_dim_padded[i]);
113
- }
114
- for(int i=0;i<4;i++) {
115
- fusion_string += 'W';
116
- fusion_string += std::to_string(w_dim_padded[i]);
117
- }
118
- for(int i=0;i<2;i++) {
119
- fusion_string += 'P';
120
- fusion_string += std::to_string(padA[i]);
121
- }
122
- for(int i=0;i<2;i++) {
123
- fusion_string += 'S';
124
- fusion_string += std::to_string(convstrideA[i]);
125
- }
126
- for(int i=0;i<2;i++) {
127
- fusion_string += 'D';
128
- fusion_string += std::to_string(dilationA[i]);
129
- }
130
- fusion_string += 'T';
131
- fusion_string += std::to_string(dataType);
132
- return fusion_string;
133
- }
134
-
135
-
136
- cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_,
137
- std::stringstream& log_buf,
138
- cudnn_frontend::OperationGraph& opGraph,
139
- std::string cache_string,
140
- bool use_heuristic = true){
141
- auto it = plan_cache.find(cache_string);
142
- if (it != plan_cache.end()) {
143
- DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
144
- return it->second;
145
- } else {
146
- DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
147
- if (use_heuristic) {
148
- // TODO: confirm which mode to use
149
- auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
150
- .setOperationGraph(opGraph)
151
- .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
152
- .build();
153
- auto engine_config_count = heuristics.getEngineConfigCount();
154
- auto& engine_configs = heuristics.getEngineConfig(engine_config_count);
155
- for (int64_t count = 0; count < engine_config_count; count++) {
156
- try {
157
- plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
158
- .setHandle(handle_)
159
- .setEngineConfig(engine_configs[count], opGraph.getTag())
160
- .build()));
161
- break;
162
- } catch (cudnn_frontend::cudnnException e) {
163
- // Throw exception if all engines failed
164
- if (count == (engine_config_count - 1)) {
165
- throw e;
166
- } else {
167
- continue;
168
- }
169
- }
170
- }
171
- } else {
172
- // How many engines support this operation graph ?
173
- auto total_engines = opGraph.getEngineCount();
174
- DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
175
- // We have to randomly pick one engine from [0, total_engines)
176
- // Selecting "0" by default
177
- auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
178
- DEBUG_CUDNN_MSG(log_buf, engine.describe());
179
- auto& knobs = engine.getSupportedKnobs();
180
- for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
181
- DEBUG_CUDNN_MSG(log_buf, it->describe());
182
- }
183
- if (knobs.begin() != knobs.end()) {
184
- DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
185
- knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
186
- DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
187
- }
188
-
189
- // Create the requisite engine config
190
- auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
191
- DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
192
- plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
193
- }
194
-
195
- return plan_cache.find(cache_string)->second;
196
- }
197
- }
198
-
199
-
200
- void
201
- run_conv_bias(int64_t* x_dim,
202
- int64_t* w_dim,
203
- int64_t* y_dim,
204
- int64_t* conv_pad,
205
- int64_t* convstride,
206
- int64_t* dilation,
207
- cudnnDataType_t dataType,
208
- at::Half* devPtrX,
209
- at::Half* devPtrW,
210
- at::Half* devPtrB,
211
- at::Half* devPtrY) {
212
-
213
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
214
- std::stringstream log_buf;
215
-
216
- try {
217
- int convDim = 2;
218
- float alpha = 1.0f;
219
- float beta = 0.0f;
220
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
221
-
222
- // Creates the necessary tensor descriptors
223
- int64_t stride[4];
224
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
225
- auto xTensor = cudnn_frontend::TensorBuilder()
226
- .setDim(4, x_dim)
227
- .setStrides(4, stride)
228
- .setId('x')
229
- .setAlignment(16)
230
- .setDataType(dataType)
231
- .build();
232
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
233
-
234
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
235
- auto wTensor = cudnn_frontend::TensorBuilder()
236
- .setDim(4, w_dim)
237
- .setStrides(4, stride)
238
- .setId('w')
239
- .setAlignment(16)
240
- .setDataType(dataType)
241
- .build();
242
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
243
-
244
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
245
- auto afterConvTensor = cudnn_frontend::TensorBuilder()
246
- .setDim(4, y_dim)
247
- .setStrides(4, stride)
248
- .setId('c')
249
- .setAlignment(16)
250
- .setDataType(CUDNN_DATA_FLOAT)
251
- .setVirtual()
252
- .build();
253
- DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());
254
-
255
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
256
- auto bTensor = cudnn_frontend::TensorBuilder()
257
- .setDim(4, b_dim)
258
- .setStrides(4, stride)
259
- .setId('b')
260
- .setAlignment(16)
261
- .setDataType(dataType)
262
- .build();
263
- DEBUG_CUDNN_MSG(log_buf, bTensor.describe());
264
-
265
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
266
- auto afterBiasTensor = cudnn_frontend::TensorBuilder()
267
- .setDim(4, y_dim)
268
- .setStrides(4, stride)
269
- .setId('y')
270
- .setAlignment(16)
271
- .setDataType(dataType)
272
- .build();
273
- DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());
274
-
275
- // Define the bias operation
276
- auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
277
- .setMode(CUDNN_POINTWISE_ADD)
278
- .setMathPrecision(CUDNN_DATA_FLOAT)
279
- .build();
280
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
281
-
282
- // Define the convolution problem
283
- auto convDesc = cudnn_frontend::ConvDescBuilder()
284
- .setDataType(CUDNN_DATA_FLOAT)
285
- .setMathMode(CUDNN_CROSS_CORRELATION)
286
- .setNDims(convDim)
287
- .setStrides(convDim, convstride)
288
- .setPrePadding(convDim, conv_pad)
289
- .setPostPadding(convDim, conv_pad)
290
- .setDilation(convDim, dilation)
291
- .build();
292
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
293
-
294
-
295
- // Create a convolution Node
296
- auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
297
- .setxDesc(xTensor)
298
- .setwDesc(wTensor)
299
- .setyDesc(afterConvTensor)
300
- .setcDesc(convDesc)
301
- .setAlpha(alpha)
302
- .setBeta(beta)
303
- .build();
304
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
305
-
306
- // Create a Bias Node.
307
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
308
- .setxDesc(conv_op.getOutputTensor())
309
- .setbDesc(bTensor)
310
- .setyDesc(afterBiasTensor)
311
- .setpwDesc(biasDesc)
312
- .build();
313
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
314
-
315
- // Create an Operation Graph. In this case it is convolution bias activation
316
- std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &bias_op};
317
-
318
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
319
- .setHandle(handle_)
320
- .setOperationGraph(2, ops.data())
321
- .build();
322
-
323
- // Create string encoding for plan caching
324
- auto cache_string = getConvFusionString(x_dim, conv_pad, convstride, dilation, w_dim, dataType, opGraph.getTag());
325
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
326
-
327
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
328
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
329
-
330
- auto workspace_size = plan.getWorkspaceSize();
331
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
332
-
333
- void* workspace_ptr = nullptr;
334
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
335
- if (workspace_size > 0) {
336
- workspace_ptr = workspace_tensor.data_ptr<float>();
337
- }
338
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrY};
339
- int64_t uids[] = {'x', 'w', 'b', 'y'};
340
- auto variantPack = cudnn_frontend::VariantPackBuilder()
341
- .setWorkspacePointer(workspace_ptr)
342
- .setDataPointers(4, data_ptrs)
343
- .setUids(4, uids)
344
- .build();
345
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
346
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
347
- checkCudnnErr(status);
348
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
349
- } catch (cudnn_frontend::cudnnException e) {
350
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
351
- }
352
- }
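Every tensor descriptor in these launchers is built from strides produced by generateStrides(..., CUDNN_TENSOR_NHWC), a helper defined earlier in this file. As a minimal sketch (names hypothetical, and the real helper may differ in details), the NHWC stride layout these calls assume, with dimensions passed in NCHW order but data stored channels-last, looks like this:

// Hypothetical sketch of NHWC stride generation for dims given in NCHW order.
#include <cstdint>

inline void nhwc_strides(const int64_t* dim, int64_t* stride) {
    // dim = {N, C, H, W}; physical layout is N, H, W, C (channels-last)
    stride[1] = 1;                         // C is innermost
    stride[3] = dim[1];                    // stepping W skips C elements
    stride[2] = dim[3] * dim[1];           // stepping H skips W*C elements
    stride[0] = dim[2] * dim[3] * dim[1];  // stepping N skips H*W*C elements
}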
353
-
354
-
355
- void
356
- run_conv_bias_mask_relu(int64_t* x_dim,
357
- int64_t* w_dim,
358
- int64_t* y_dim,
359
- int64_t* conv_pad,
360
- int64_t* conv_stride,
361
- int64_t* conv_dilation,
362
- cudnnDataType_t dataType,
363
- at::Half* devPtrX,
364
- at::Half* devPtrW,
365
- at::Half* devPtrB,
366
- int8_t* devPtrM,
367
- at::Half* devPtrY) {
368
-
369
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
370
- std::stringstream log_buf;
371
-
372
- try {
373
- int conv_dim = 2;
374
- float alpha = 1.0f;
375
- float beta = 0.0f;
376
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
377
-
378
- // Creates the necessary tensor descriptors
379
- int64_t stride[4];
380
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
381
- auto xTensor = cudnn_frontend::TensorBuilder()
382
- .setDim(4, x_dim)
383
- .setStrides(4, stride)
384
- .setId('x')
385
- .setAlignment(16)
386
- .setDataType(dataType)
387
- .build();
388
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
389
-
390
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
391
- auto wTensor = cudnn_frontend::TensorBuilder()
392
- .setDim(4, w_dim)
393
- .setStrides(4, stride)
394
- .setId('w')
395
- .setAlignment(16)
396
- .setDataType(dataType)
397
- .build();
398
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
399
-
400
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
401
- auto mTensor = cudnn_frontend::TensorBuilder()
402
- .setDim(4, y_dim)
403
- .setStrides(4, stride)
404
- .setId('m')
405
- .setAlignment(16)
406
- .setDataType(CUDNN_DATA_INT8)
407
- .build();
408
- DEBUG_CUDNN_MSG(log_buf, mTensor.describe());
409
-
410
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
411
- auto afterConvTensor = cudnn_frontend::TensorBuilder()
412
- .setDim(4, y_dim)
413
- .setStrides(4, stride)
414
- .setId('c')
415
- .setAlignment(16)
416
- .setDataType(CUDNN_DATA_FLOAT)
417
- .setVirtual()
418
- .build();
419
- DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());
420
-
421
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
422
- auto bTensor = cudnn_frontend::TensorBuilder()
423
- .setDim(4, b_dim)
424
- .setStrides(4, stride)
425
- .setId('b')
426
- .setAlignment(16)
427
- .setDataType(dataType)
428
- .build();
429
- DEBUG_CUDNN_MSG(log_buf, bTensor.describe());
430
-
431
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
432
- auto afterBiasTensor = cudnn_frontend::TensorBuilder()
433
- .setDim(4, y_dim)
434
- .setStrides(4, stride)
435
- .setId('B')
436
- .setAlignment(16)
437
- .setDataType(CUDNN_DATA_FLOAT)
438
- .setVirtual()
439
- .build();
440
- DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());
441
-
442
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
443
- auto afterMaskTensor = cudnn_frontend::TensorBuilder()
444
- .setDim(4, y_dim)
445
- .setStrides(4, stride)
446
- .setId('M')
447
- .setAlignment(16)
448
- .setDataType(CUDNN_DATA_FLOAT)
449
- .setVirtual()
450
- .build();
451
- DEBUG_CUDNN_MSG(log_buf, afterMaskTensor.describe());
452
-
453
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
454
- auto afterReLUTensor = cudnn_frontend::TensorBuilder()
455
- .setDim(4, y_dim)
456
- .setStrides(4, stride)
457
- .setId('y')
458
- .setAlignment(16)
459
- .setDataType(dataType)
460
- .build();
461
- DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());
462
-
463
- // Define the convolution problem
464
- auto convDesc = cudnn_frontend::ConvDescBuilder()
465
- .setDataType(CUDNN_DATA_FLOAT)
466
- .setMathMode(CUDNN_CROSS_CORRELATION)
467
- .setNDims(conv_dim)
468
- .setStrides(conv_dim, conv_stride)
469
- .setPrePadding(conv_dim, conv_pad)
470
- .setPostPadding(conv_dim, conv_pad)
471
- .setDilation(conv_dim, conv_dilation)
472
- .build();
473
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
474
-
475
- // Define the bias operation
476
- auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
477
- .setMode(CUDNN_POINTWISE_ADD)
478
- .setMathPrecision(CUDNN_DATA_FLOAT)
479
- .build();
480
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
481
-
482
- // Define the mask operation
483
- auto maskDesc = cudnn_frontend::PointWiseDescBuilder()
484
- .setMode(CUDNN_POINTWISE_MUL)
485
- .setMathPrecision(CUDNN_DATA_FLOAT)
486
- .build();
487
-
488
- // Define the activation operation
489
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
490
- .setMode(CUDNN_POINTWISE_RELU_FWD)
491
- .setMathPrecision(CUDNN_DATA_FLOAT)
492
- .build();
493
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
494
-
495
- // Create a convolution Node
496
- auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
497
- .setxDesc(xTensor)
498
- .setwDesc(wTensor)
499
- .setyDesc(afterConvTensor)
500
- .setcDesc(convDesc)
501
- .setAlpha(alpha)
502
- .setBeta(beta)
503
- .build();
504
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
505
-
506
- // Create a Bias Node
507
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
508
- .setxDesc(conv_op.getOutputTensor())
509
- .setbDesc(bTensor)
510
- .setyDesc(afterBiasTensor)
511
- .setpwDesc(biasDesc)
512
- .build();
513
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
514
-
515
- // create a Mask Node
516
- auto mask_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
517
- .setxDesc(bias_op.getOutputTensor())
518
- .setbDesc(mTensor)
519
- .setyDesc(afterMaskTensor)
520
- .setpwDesc(maskDesc)
521
- .build();
522
- DEBUG_CUDNN_MSG(log_buf, mask_op.describe());
523
-
524
- // Create an Activation Node
525
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
526
- .setxDesc(mask_op.getOutputTensor())
527
- .setyDesc(afterReLUTensor)
528
- .setpwDesc(actDesc)
529
- .build();
530
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
531
-
532
- // Create an Operation Graph. In this case it is convolution, bias, mask and activation
533
- std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &bias_op, &mask_op, &act_op};
534
-
535
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
536
- .setHandle(handle_)
537
- .setOperationGraph(4, ops.data())
538
- .build();
539
-
540
- // Create string encoding for plan caching
541
- auto cache_string = getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
542
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
543
-
544
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
545
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
546
-
547
- auto workspace_size = plan.getWorkspaceSize();
548
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
549
-
550
- void* workspace_ptr = nullptr;
551
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
552
- if (workspace_size > 0) {
553
- workspace_ptr = workspace_tensor.data_ptr<float>();
554
- }
555
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrM, devPtrY};
556
- int64_t uids[] = {'x', 'w', 'b', 'm', 'y'};
557
- auto variantPack = cudnn_frontend::VariantPackBuilder()
558
- .setWorkspacePointer(workspace_ptr)
559
- .setDataPointers(5, data_ptrs)
560
- .setUids(5, uids)
561
- .build();
562
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
563
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
564
- checkCudnnErr(status);
565
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
566
- } catch (cudnn_frontend::cudnnException e) {
567
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
568
- }
569
- }
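Per element, the four-node graph above computes relu((conv(x, w) + b) * m), with the per-channel bias and the int8 mask broadcast over the convolution output. A scalar reference for the epilogue (a validation sketch only, assuming the convolution result is already available in float) is:

// Scalar reference for the bias -> mask -> relu epilogue fused above.
#include <algorithm>
#include <cstdint>

inline float bias_mask_relu_ref(float conv_out, float bias, int8_t mask) {
    float v = (conv_out + bias) * static_cast<float>(mask);
    return std::max(v, 0.0f);
}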
570
-
571
-
572
- void
573
- run_conv_cscale_cbias_relu(int64_t* x_dim,
574
- int64_t* w_dim,
575
- int64_t* y_dim,
576
- int64_t* conv_pad,
577
- int64_t* conv_stride,
578
- int64_t* conv_dilation,
579
- cudnnDataType_t dataType,
580
- at::Half* devPtrX,
581
- at::Half* devPtrW,
582
- at::Half* devPtrS,
583
- at::Half* devPtrB,
584
- at::Half* devPtrY) {
585
-
586
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
587
- std::stringstream log_buf;
588
-
589
- try {
590
- int conv_dim = 2;
591
- float alpha = 1.0f;
592
- float beta = 0.0f;
593
- int64_t s_dim[] = {1, y_dim[1], 1, 1};
594
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
595
-
596
- // Creates the necessary tensor descriptors
597
- int64_t stride[4];
598
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
599
- auto xTensor = cudnn_frontend::TensorBuilder()
600
- .setDim(4, x_dim)
601
- .setStrides(4, stride)
602
- .setId('x')
603
- .setAlignment(16)
604
- .setDataType(dataType)
605
- .build();
606
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
607
-
608
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
609
- auto wTensor = cudnn_frontend::TensorBuilder()
610
- .setDim(4, w_dim)
611
- .setStrides(4, stride)
612
- .setId('w')
613
- .setAlignment(16)
614
- .setDataType(dataType)
615
- .build();
616
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
617
-
618
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
619
- auto afterConvTensor = cudnn_frontend::TensorBuilder()
620
- .setDim(4, y_dim)
621
- .setStrides(4, stride)
622
- .setId('c')
623
- .setAlignment(16)
624
- .setDataType(CUDNN_DATA_FLOAT)
625
- .setVirtual()
626
- .build();
627
- DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());
628
-
629
- generateStrides(s_dim, stride, 4, CUDNN_TENSOR_NHWC);
630
- auto sTensor = cudnn_frontend::TensorBuilder()
631
- .setDim(4, s_dim)
632
- .setStrides(4, stride)
633
- .setId('s')
634
- .setAlignment(16)
635
- .setDataType(dataType)
636
- .build();
637
- DEBUG_CUDNN_MSG(log_buf, sTensor.describe());
638
-
639
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
640
- auto afterScaleTensor = cudnn_frontend::TensorBuilder()
641
- .setDim(4, y_dim)
642
- .setStrides(4, stride)
643
- .setId('S')
644
- .setAlignment(16)
645
- .setDataType(CUDNN_DATA_FLOAT)
646
- .setVirtual()
647
- .build();
648
- DEBUG_CUDNN_MSG(log_buf, afterScaleTensor.describe());
649
-
650
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
651
- auto bTensor = cudnn_frontend::TensorBuilder()
652
- .setDim(4, b_dim)
653
- .setStrides(4, stride)
654
- .setId('b')
655
- .setAlignment(16)
656
- .setDataType(dataType)
657
- .build();
658
- DEBUG_CUDNN_MSG(log_buf, bTensor.describe());
659
-
660
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
661
- auto afterBiasTensor = cudnn_frontend::TensorBuilder()
662
- .setDim(4, y_dim)
663
- .setStrides(4, stride)
664
- .setId('B')
665
- .setAlignment(16)
666
- .setDataType(CUDNN_DATA_FLOAT)
667
- .setVirtual()
668
- .build();
669
- DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());
670
-
671
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
672
- auto afterReLUTensor = cudnn_frontend::TensorBuilder()
673
- .setDim(4, y_dim)
674
- .setStrides(4, stride)
675
- .setId('y')
676
- .setAlignment(16)
677
- .setDataType(dataType)
678
- .build();
679
- DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());
680
-
681
- // Define the convolution problem
682
- auto convDesc = cudnn_frontend::ConvDescBuilder()
683
- .setDataType(CUDNN_DATA_FLOAT)
684
- .setMathMode(CUDNN_CROSS_CORRELATION)
685
- .setNDims(conv_dim)
686
- .setStrides(conv_dim, conv_stride)
687
- .setPrePadding(conv_dim, conv_pad)
688
- .setPostPadding(conv_dim, conv_pad)
689
- .setDilation(conv_dim, conv_dilation)
690
- .build();
691
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
692
-
693
- // Define the scale operation
694
- auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
695
- .setMode(CUDNN_POINTWISE_MUL)
696
- .setMathPrecision(CUDNN_DATA_FLOAT)
697
- .build();
698
- DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
699
-
700
- // Define the bias operation
701
- auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
702
- .setMode(CUDNN_POINTWISE_ADD)
703
- .setMathPrecision(CUDNN_DATA_FLOAT)
704
- .build();
705
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
706
-
707
- // Define the activation operation
708
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
709
- .setMode(CUDNN_POINTWISE_RELU_FWD)
710
- .setMathPrecision(CUDNN_DATA_FLOAT)
711
- .build();
712
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
713
-
714
- // Create a convolution Node
715
- auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
716
- .setxDesc(xTensor)
717
- .setwDesc(wTensor)
718
- .setyDesc(afterConvTensor)
719
- .setcDesc(convDesc)
720
- .setAlpha(alpha)
721
- .setBeta(beta)
722
- .build();
723
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
724
-
725
- // Create a scale Node.
726
- auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
727
- .setxDesc(conv_op.getOutputTensor())
728
- .setbDesc(sTensor)
729
- .setyDesc(afterScaleTensor)
730
- .setpwDesc(scaleDesc)
731
- .build();
732
- DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
733
-
734
- // Create a Bias Node.
735
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
736
- .setxDesc(scale_op.getOutputTensor())
737
- .setbDesc(bTensor)
738
- .setyDesc(afterBiasTensor)
739
- .setpwDesc(biasDesc)
740
- .build();
741
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
742
-
743
- // Create an Activation Node.
744
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
745
- .setxDesc(bias_op.getOutputTensor())
746
- .setyDesc(afterReLUTensor)
747
- .setpwDesc(actDesc)
748
- .build();
749
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
750
-
751
- // Create an Operation Graph. In this case it is convolution, scale, bias and activation
752
- std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &scale_op, &bias_op, &act_op};
753
-
754
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
755
- .setHandle(handle_)
756
- .setOperationGraph(ops.size(), ops.data())
757
- .build();
758
-
759
- // Create string encoding for plan caching
760
- auto cache_string = getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
761
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
762
-
763
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
764
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
765
-
766
- auto workspace_size = plan.getWorkspaceSize();
767
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
768
-
769
- void* workspace_ptr = nullptr;
770
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
771
- if (workspace_size > 0) {
772
- workspace_ptr = workspace_tensor.data_ptr<float>();
773
- }
774
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrS, devPtrB, devPtrY};
775
- int64_t uids[] = {'x', 'w', 's', 'b', 'y'};
776
- auto variantPack = cudnn_frontend::VariantPackBuilder()
777
- .setWorkspacePointer(workspace_ptr)
778
- .setDataPointers(5, data_ptrs)
779
- .setUids(5, uids)
780
- .build();
781
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
782
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
783
- checkCudnnErr(status);
784
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
785
- } catch (cudnn_frontend::cudnnException e) {
786
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
787
- }
788
- }
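getConvFusionString, defined earlier in this file, turns the problem shape plus the operation-graph tag into the key consumed by getOrCreatePlan. A minimal sketch of how such a key could be assembled (the actual encoding may differ) is simply a concatenation of every field that influences plan selection:

// Hypothetical sketch of a plan-cache key: two calls that agree on all of
// these fields can safely reuse the same cached execution plan.
#include <cstdint>
#include <sstream>
#include <string>

inline std::string conv_fusion_key(const int64_t* x_dim, const int64_t* pad,
                                   const int64_t* stride, const int64_t* dilation,
                                   const int64_t* w_dim, int data_type,
                                   const std::string& graph_tag) {
    std::stringstream key;
    for (int i = 0; i < 4; ++i) key << x_dim[i] << ',';
    for (int i = 0; i < 2; ++i) key << pad[i] << ',' << stride[i] << ',' << dilation[i] << ',';
    for (int i = 0; i < 4; ++i) key << w_dim[i] << ',';
    key << data_type << ',' << graph_tag;
    return key.str();
}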
789
-
790
-
791
- void
792
- run_conv_bias_relu(int64_t* x_dim,
793
- int64_t* w_dim,
794
- int64_t* y_dim,
795
- int64_t* conv_pad,
796
- int64_t* conv_stride,
797
- int64_t* conv_dilation,
798
- cudnnDataType_t dataType,
799
- at::Half* devPtrX,
800
- at::Half* devPtrW,
801
- at::Half* devPtrB,
802
- at::Half* devPtrY) {
803
-
804
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
805
- std::stringstream log_buf;
806
-
807
- try {
808
- int conv_dim = 2;
809
- float alpha = 1.0f;
810
- float beta = 0.0f;
811
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
812
-
813
- // Creates the necessary tensor descriptors
814
- int64_t stride[4];
815
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
816
- auto xTensor = cudnn_frontend::TensorBuilder()
817
- .setDim(4, x_dim)
818
- .setStrides(4, stride)
819
- .setId('x')
820
- .setAlignment(16)
821
- .setDataType(dataType)
822
- .build();
823
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
824
-
825
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
826
- auto wTensor = cudnn_frontend::TensorBuilder()
827
- .setDim(4, w_dim)
828
- .setStrides(4, stride)
829
- .setId('w')
830
- .setAlignment(16)
831
- .setDataType(dataType)
832
- .build();
833
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
834
-
835
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
836
- auto afterConvTensor = cudnn_frontend::TensorBuilder()
837
- .setDim(4, y_dim)
838
- .setStrides(4, stride)
839
- .setId('c')
840
- .setAlignment(16)
841
- .setDataType(CUDNN_DATA_FLOAT)
842
- .setVirtual()
843
- .build();
844
- DEBUG_CUDNN_MSG(log_buf, afterConvTensor.describe());
845
-
846
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
847
- auto bTensor = cudnn_frontend::TensorBuilder()
848
- .setDim(4, b_dim)
849
- .setStrides(4, stride)
850
- .setId('b')
851
- .setAlignment(16)
852
- .setDataType(dataType)
853
- .build();
854
- DEBUG_CUDNN_MSG(log_buf, bTensor.describe());
855
-
856
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
857
- auto afterBiasTensor = cudnn_frontend::TensorBuilder()
858
- .setDim(4, y_dim)
859
- .setStrides(4, stride)
860
- .setId('B')
861
- .setAlignment(16)
862
- .setDataType(CUDNN_DATA_FLOAT)
863
- .setVirtual()
864
- .build();
865
- DEBUG_CUDNN_MSG(log_buf, afterBiasTensor.describe());
866
-
867
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
868
- auto afterReLUTensor = cudnn_frontend::TensorBuilder()
869
- .setDim(4, y_dim)
870
- .setStrides(4, stride)
871
- .setId('y')
872
- .setAlignment(16)
873
- .setDataType(dataType)
874
- .build();
875
- DEBUG_CUDNN_MSG(log_buf, afterReLUTensor.describe());
876
-
877
- // Define the convolution problem
878
- auto convDesc = cudnn_frontend::ConvDescBuilder()
879
- .setDataType(CUDNN_DATA_FLOAT)
880
- .setMathMode(CUDNN_CROSS_CORRELATION)
881
- .setNDims(conv_dim)
882
- .setStrides(conv_dim, conv_stride)
883
- .setPrePadding(conv_dim, conv_pad)
884
- .setPostPadding(conv_dim, conv_pad)
885
- .setDilation(conv_dim, conv_dilation)
886
- .build();
887
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
888
-
889
- // Define the bias operation
890
- auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
891
- .setMode(CUDNN_POINTWISE_ADD)
892
- .setMathPrecision(CUDNN_DATA_FLOAT)
893
- .build();
894
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
895
-
896
- // Define the activation operation
897
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
898
- .setMode(CUDNN_POINTWISE_RELU_FWD)
899
- .setMathPrecision(CUDNN_DATA_FLOAT)
900
- .build();
901
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
902
-
903
- // Create a convolution Node
904
- auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
905
- .setxDesc(xTensor)
906
- .setwDesc(wTensor)
907
- .setyDesc(afterConvTensor)
908
- .setcDesc(convDesc)
909
- .setAlpha(alpha)
910
- .setBeta(beta)
911
- .build();
912
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
913
-
914
- // Create a Bias Node.
915
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
916
- .setxDesc(conv_op.getOutputTensor())
917
- .setbDesc(bTensor)
918
- .setyDesc(afterBiasTensor)
919
- .setpwDesc(biasDesc)
920
- .build();
921
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
922
-
923
- // Create an Activation Node.
924
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
925
- .setxDesc(bias_op.getOutputTensor())
926
- .setyDesc(afterReLUTensor)
927
- .setpwDesc(actDesc)
928
- .build();
929
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
930
-
931
- // Create an Operation Graph. In this case it is convolution bias activation
932
- std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &bias_op, &act_op};
933
-
934
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
935
- .setHandle(handle_)
936
- .setOperationGraph(3, ops.data())
937
- .build();
938
-
939
- // Create string encoding for plan caching
940
- auto cache_string = getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
941
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
942
-
943
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
944
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
945
-
946
- auto workspace_size = plan.getWorkspaceSize();
947
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
948
-
949
- void* workspace_ptr = nullptr;
950
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
951
- if (workspace_size > 0) {
952
- workspace_ptr = workspace_tensor.data_ptr<float>();
953
- }
954
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrB, devPtrY};
955
- int64_t uids[] = {'x', 'w', 'b', 'y'};
956
- auto variantPack = cudnn_frontend::VariantPackBuilder()
957
- .setWorkspacePointer(workspace_ptr)
958
- .setDataPointers(4, data_ptrs)
959
- .setUids(4, uids)
960
- .build();
961
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
962
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
963
- checkCudnnErr(status);
964
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
965
- } catch (cudnn_frontend::cudnnException e) {
966
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
967
- }
968
- }
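Each launcher allocates the plan's workspace, which is reported in bytes, as a float tensor; that is why the element count is rounded up with (workspace_size + 3) / 4. Stated as a tiny helper for clarity:

// The plan reports workspace in bytes; backing it with a float tensor means
// rounding the byte count up to whole 4-byte elements.
#include <cstdint>

inline int64_t workspace_float_elements(int64_t workspace_bytes) {
    return (workspace_bytes + 3) / 4;  // ceil(bytes / sizeof(float))
}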
969
-
970
-
971
- void
972
- run_drelu_dscale(int64_t* dy_dim,
973
- cudnnDataType_t dataType,
974
- at::Half* devPtrDY,
975
- at::Half* devPtrR,
976
- at::Half* devPtrS,
977
- at::Half* devPtrDX) {
978
-
979
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
980
- std::stringstream log_buf;
981
-
982
- try {
983
- int convDim = 2;
984
- float alpha = 1.0f;
985
- float beta = 0.0f;
986
- int64_t s_dim[] = {1, dy_dim[1], 1, 1};
987
-
988
- // Creates the necessary tensor descriptors
989
- int64_t stride[4];
990
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
991
- auto dyTensor = cudnn_frontend::TensorBuilder()
992
- .setDim(4, dy_dim)
993
- .setStrides(4, stride)
994
- .setId('y')
995
- .setAlignment(16)
996
- .setDataType(dataType)
997
- .build();
998
- DEBUG_CUDNN_MSG(log_buf, dyTensor.describe());
999
-
1000
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1001
- auto rTensor = cudnn_frontend::TensorBuilder()
1002
- .setDim(4, dy_dim)
1003
- .setStrides(4, stride)
1004
- .setId('r')
1005
- .setAlignment(16)
1006
- .setDataType(dataType)
1007
- .build();
1008
- DEBUG_CUDNN_MSG(log_buf, rTensor.describe());
1009
-
1010
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1011
- auto inActGradTensor = cudnn_frontend::TensorBuilder()
1012
- .setDim(4, dy_dim)
1013
- .setStrides(4, stride)
1014
- .setId('R')
1015
- .setAlignment(16)
1016
- .setDataType(CUDNN_DATA_FLOAT)
1017
- .setVirtual()
1018
- .build();
1019
- DEBUG_CUDNN_MSG(log_buf, inActGradTensor.describe());
1020
-
1021
- generateStrides(s_dim, stride, 4, CUDNN_TENSOR_NHWC);
1022
- auto scaleTensor = cudnn_frontend::TensorBuilder()
1023
- .setDim(4, s_dim)
1024
- .setStrides(4, stride)
1025
- .setId('s')
1026
- .setAlignment(16)
1027
- .setDataType(dataType)
1028
- .build();
1029
- DEBUG_CUDNN_MSG(log_buf, scaleTensor.describe());
1030
-
1031
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1032
- auto dxTensor = cudnn_frontend::TensorBuilder()
1033
- .setDim(4, dy_dim)
1034
- .setStrides(4, stride)
1035
- .setId('x')
1036
- .setAlignment(16)
1037
- .setDataType(dataType)
1038
- .build();
1039
- DEBUG_CUDNN_MSG(log_buf, dxTensor.describe());
1040
-
1041
- // Define the activation backward operation
1042
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
1043
- .setMode(CUDNN_POINTWISE_RELU_BWD)
1044
- .setMathPrecision(CUDNN_DATA_FLOAT)
1045
- .build();
1046
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
1047
-
1048
- // Define the scale operation
1049
- auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
1050
- .setMode(CUDNN_POINTWISE_MUL)
1051
- .setMathPrecision(CUDNN_DATA_FLOAT)
1052
- .build();
1053
- DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
1054
-
1055
- // Create an relu backward Node
1056
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
1057
- .setdyDesc(dyTensor)
1058
- .setxDesc(rTensor)
1059
- .setdxDesc(inActGradTensor)
1060
- .setpwDesc(actDesc)
1061
- .build();
1062
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
1063
-
1064
- // Create a scale node
1065
- auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
1066
- .setxDesc(inActGradTensor)
1067
- .setbDesc(scaleTensor)
1068
- .setyDesc(dxTensor)
1069
- .setpwDesc(scaleDesc)
1070
- .build();
1071
- DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
1072
-
1073
- // Create an Operation Graph. In this case it is relu backward followed by scale
1074
- std::array<cudnn_frontend::Operation const*, 2> ops = {&act_op, &scale_op};
1075
-
1076
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
1077
- .setHandle(handle_)
1078
- .setOperationGraph(ops.size(), ops.data())
1079
- .build();
1080
-
1081
- // Create string encoding for plan caching
1082
- // creating unique dummy values
1083
- int64_t pad_dummy[] = {40, 40};
1084
- int64_t stride_dummy[] = {40, 40};
1085
- int64_t dilation_dummy[] = {40, 40};
1086
- auto cache_string = getConvFusionString(dy_dim, pad_dummy, stride_dummy, dilation_dummy, s_dim, dataType, opGraph.getTag());
1087
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
1088
-
1089
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
1090
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
1091
-
1092
- auto workspace_size = plan.getWorkspaceSize();
1093
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
1094
-
1095
- void* workspace_ptr = nullptr;
1096
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
1097
- if (workspace_size > 0) {
1098
- workspace_ptr = workspace_tensor.data_ptr<float>();
1099
- }
1100
- void* data_ptrs[] = {devPtrDY, devPtrR, devPtrS, devPtrDX};
1101
- int64_t uids[] = {'y', 'r', 's', 'x'};
1102
- auto variantPack = cudnn_frontend::VariantPackBuilder()
1103
- .setWorkspacePointer(workspace_ptr)
1104
- .setDataPointers(4, data_ptrs)
1105
- .setUids(4, uids)
1106
- .build();
1107
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
1108
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
1109
- checkCudnnErr(status);
1110
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
1111
- } catch (cudnn_frontend::cudnnException e) {
1112
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
1113
- }
1114
- }
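Element-wise, the two-node backward graph above computes dx = scale * (dy where the saved ReLU output is positive, else 0), with the per-channel scale broadcast over N, H and W. A scalar reference sketch:

// Scalar reference for the fused relu-backward + scale graph above.
inline float drelu_dscale_ref(float dy, float relu_out, float scale) {
    float d = (relu_out > 0.0f) ? dy : 0.0f;  // CUDNN_POINTWISE_RELU_BWD
    return d * scale;                          // CUDNN_POINTWISE_MUL
}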
1115
-
1116
-
1117
- void
1118
- run_drelu_dbias(int64_t* dy_dim,
1119
- cudnnDataType_t dataType,
1120
- at::Half* devPtrDY,
1121
- at::Half* devPtrR,
1122
- at::Half* devPtrDR,
1123
- float* devPtrDB) {
1124
-
1125
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
1126
- std::stringstream log_buf;
1127
-
1128
- try {
1129
- int convDim = 2;
1130
- float alpha = 1.0f;
1131
- float beta = 0.0f;
1132
- int64_t b_dim[] = {1, dy_dim[1], 1, 1};
1133
-
1134
- // Creates the necessary tensor descriptors
1135
- int64_t stride[4];
1136
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1137
- auto dyTensor = cudnn_frontend::TensorBuilder()
1138
- .setDim(4, dy_dim)
1139
- .setStrides(4, stride)
1140
- .setId('x')
1141
- .setAlignment(16)
1142
- .setDataType(dataType)
1143
- .build();
1144
- DEBUG_CUDNN_MSG(log_buf, dyTensor.describe());
1145
-
1146
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1147
- auto rTensor = cudnn_frontend::TensorBuilder()
1148
- .setDim(4, dy_dim)
1149
- .setStrides(4, stride)
1150
- .setId('r')
1151
- .setAlignment(16)
1152
- .setDataType(dataType)
1153
- .build();
1154
- DEBUG_CUDNN_MSG(log_buf, rTensor.describe());
1155
-
1156
- generateStrides(dy_dim, stride, 4, CUDNN_TENSOR_NHWC);
1157
- auto inActGradTensor = cudnn_frontend::TensorBuilder()
1158
- .setDim(4, dy_dim)
1159
- .setStrides(4, stride)
1160
- .setId('R')
1161
- .setAlignment(16)
1162
- .setDataType(dataType)
1163
- .build();
1164
- DEBUG_CUDNN_MSG(log_buf, inActGradTensor.describe());
1165
-
1166
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
1167
- auto biasGradTensor = cudnn_frontend::TensorBuilder()
1168
- .setDim(4, b_dim)
1169
- .setStrides(4, stride)
1170
- .setId('y')
1171
- .setAlignment(16)
1172
- .setDataType(CUDNN_DATA_FLOAT)
1173
- .build();
1174
- DEBUG_CUDNN_MSG(log_buf, biasGradTensor.describe());
1175
-
1176
- // Define the activation backward operation
1177
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
1178
- .setMode(CUDNN_POINTWISE_RELU_BWD)
1179
- .setMathPrecision(CUDNN_DATA_FLOAT)
1180
- .build();
1181
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
1182
-
1183
- // Define the bias backward operation
1184
- auto biasDesc = cudnn_frontend::ReductionDescBuilder()
1185
- .setMathPrecision(CUDNN_DATA_FLOAT)
1186
- .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
1187
- .build();
1188
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
1189
-
1190
- // Create an relu backward Node
1191
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
1192
- .setdyDesc(dyTensor)
1193
- .setxDesc(rTensor)
1194
- .setdxDesc(inActGradTensor)
1195
- .setpwDesc(actDesc)
1196
- .build();
1197
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
1198
-
1199
- // Create bias node
1200
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
1201
- .setxDesc(inActGradTensor)
1202
- .setyDesc(biasGradTensor)
1203
- .setreductionDesc(biasDesc)
1204
- .build();
1205
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
1206
-
1207
- // Create an Operation Graph. In this case it is relu backward followed by bias reduction
1208
- std::array<cudnn_frontend::Operation const*, 2> ops = {&act_op, &bias_op};
1209
-
1210
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
1211
- .setHandle(handle_)
1212
- .setOperationGraph(ops.size(), ops.data())
1213
- .build();
1214
-
1215
- // Create string encoding for plan caching
1216
- // creating unique dummy values
1217
- int64_t pad_dummy[] = {20, 20};
1218
- int64_t stride_dummy[] = {20, 20};
1219
- int64_t dilation_dummy[] = {20, 20};
1220
- auto cache_string = getConvFusionString(dy_dim, pad_dummy, stride_dummy, dilation_dummy, b_dim, dataType, opGraph.getTag());
1221
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
1222
-
1223
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
1224
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
1225
-
1226
- auto workspace_size = plan.getWorkspaceSize();
1227
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
1228
-
1229
- void* workspace_ptr = nullptr;
1230
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
1231
- if (workspace_size > 0) {
1232
- workspace_ptr = workspace_tensor.data_ptr<float>();
1233
- }
1234
- void* data_ptrs[] = {devPtrDY, devPtrR, devPtrDR, devPtrDB};
1235
- int64_t uids[] = {'x', 'r', 'R', 'y'};
1236
- auto variantPack = cudnn_frontend::VariantPackBuilder()
1237
- .setWorkspacePointer(workspace_ptr)
1238
- .setDataPointers(4, data_ptrs)
1239
- .setUids(4, uids)
1240
- .build();
1241
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
1242
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
1243
- checkCudnnErr(status);
1244
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
1245
- } catch (cudnn_frontend::cudnnException e) {
1246
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
1247
- }
1248
- }
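The reduction node above sums the masked gradient over N, H and W, producing one float per channel. A reference loop over channels-last buffers (a validation sketch with the dtype simplified to float, not the kernel itself):

// Reference for the relu-backward + per-channel bias-grad reduction above,
// assuming dy and relu_out are NHWC buffers with logical shape {N, C, H, W}.
#include <cstdint>
#include <vector>

inline std::vector<float> drelu_dbias_ref(const float* dy, const float* relu_out,
                                          int64_t N, int64_t C, int64_t H, int64_t W) {
    std::vector<float> db(C, 0.0f);
    for (int64_t i = 0; i < N * H * W; ++i) {
        for (int64_t c = 0; c < C; ++c) {
            int64_t idx = i * C + c;  // channels-last indexing
            db[c] += (relu_out[idx] > 0.0f) ? dy[idx] : 0.0f;
        }
    }
    return db;
}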
1249
-
1250
-
1251
- void
1252
- run_dconv_drelu_dbias(int64_t* x_dim,
1253
- int64_t* w_dim,
1254
- int64_t* y_dim,
1255
- int64_t* pad,
1256
- int64_t* convstride,
1257
- int64_t* dilation,
1258
- cudnnDataType_t dataType,
1259
- at::Half* devPtrX,
1260
- at::Half* devPtrW,
1261
- at::Half* devPtrR,
1262
- at::Half* devPtrRg,
1263
- float* devPtrY) {
1264
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
1265
- std::stringstream log_buf;
1266
- try {
1267
- int convDim = 2;
1268
- float alpha = 1.0f;
1269
- float beta = 0.0f;
1270
- int64_t b_dim[] = {1, x_dim[1], 1, 1};
1271
-
1272
- int64_t stride[4];
1273
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
1274
- auto outConvGradTensor = cudnn_frontend::TensorBuilder()
1275
- .setDim(4, y_dim)
1276
- .setStrides(4, stride)
1277
- .setId('x')
1278
- .setAlignment(16)
1279
- .setDataType(dataType)
1280
- .build();
1281
- DEBUG_CUDNN_MSG(log_buf, outConvGradTensor.describe());
1282
-
1283
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
1284
- auto wTensor = cudnn_frontend::TensorBuilder()
1285
- .setDim(4, w_dim)
1286
- .setStrides(4, stride)
1287
- .setId('w')
1288
- .setAlignment(16)
1289
- .setDataType(dataType)
1290
- .build();
1291
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
1292
-
1293
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
1294
- auto inConvGradTensor = cudnn_frontend::TensorBuilder()
1295
- .setDim(4, x_dim)
1296
- .setStrides(4, stride)
1297
- .setId('A')
1298
- .setAlignment(16)
1299
- .setDataType(CUDNN_DATA_FLOAT)
1300
- .setVirtual()
1301
- .build();
1302
- DEBUG_CUDNN_MSG(log_buf, inConvGradTensor.describe());
1303
-
1304
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
1305
- auto rTensor = cudnn_frontend::TensorBuilder()
1306
- .setDim(4, x_dim)
1307
- .setStrides(4, stride)
1308
- .setId('r')
1309
- .setAlignment(16)
1310
- .setDataType(dataType)
1311
- .build();
1312
- DEBUG_CUDNN_MSG(log_buf, rTensor.describe());
1313
-
1314
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
1315
- auto inReLUGradTensor = cudnn_frontend::TensorBuilder()
1316
- .setDim(4, x_dim)
1317
- .setStrides(4, stride)
1318
- .setId('R')
1319
- .setAlignment(16)
1320
- .setDataType(dataType)
1321
- .build();
1322
- DEBUG_CUDNN_MSG(log_buf, inReLUGradTensor.describe());
1323
-
1324
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
1325
- auto inBiasGradTensor = cudnn_frontend::TensorBuilder()
1326
- .setDim(4, b_dim)
1327
- .setStrides(4, stride)
1328
- .setId('y')
1329
- .setAlignment(16)
1330
- .setDataType(CUDNN_DATA_FLOAT)
1331
- .build();
1332
- DEBUG_CUDNN_MSG(log_buf, inBiasGradTensor.describe());
1333
-
1334
- // Define the convolution problem
1335
- auto convDesc = cudnn_frontend::ConvDescBuilder()
1336
- .setDataType(CUDNN_DATA_FLOAT)
1337
- .setMathMode(CUDNN_CROSS_CORRELATION)
1338
- .setNDims(convDim)
1339
- .setStrides(convDim, convstride)
1340
- .setPrePadding(convDim, pad)
1341
- .setPostPadding(convDim, pad)
1342
- .setDilation(convDim, dilation)
1343
- .build();
1344
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
1345
-
1346
- // Define the activation backward operation
1347
- auto actDesc = cudnn_frontend::PointWiseDescBuilder()
1348
- .setMode(CUDNN_POINTWISE_RELU_BWD)
1349
- .setMathPrecision(CUDNN_DATA_FLOAT)
1350
- .build();
1351
- DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
1352
-
1353
- // Define the bias backward operation
1354
- auto biasDesc = cudnn_frontend::ReductionDescBuilder()
1355
- .setMathPrecision(CUDNN_DATA_FLOAT)
1356
- .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
1357
- .build();
1358
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
1359
-
1360
- // Create a convolution backward data Node
1361
- auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
1362
- .setdyDesc(outConvGradTensor)
1363
- .setwDesc(wTensor)
1364
- .setdxDesc(inConvGradTensor)
1365
- .setcDesc(convDesc)
1366
- .setAlpha(alpha)
1367
- .setBeta(beta)
1368
- .build();
1369
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
1370
-
1371
- // Create an relu backward Node
1372
- auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
1373
- .setdyDesc(inConvGradTensor)
1374
- .setxDesc(rTensor)
1375
- .setdxDesc(inReLUGradTensor)
1376
- .setpwDesc(actDesc)
1377
- .build();
1378
- DEBUG_CUDNN_MSG(log_buf, act_op.describe());
1379
-
1380
- // Create bias node
1381
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
1382
- .setxDesc(inReLUGradTensor)
1383
- .setyDesc(inBiasGradTensor)
1384
- .setreductionDesc(biasDesc)
1385
- .build();
1386
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
1387
-
1388
- // Create an Operation Graph. In this case it is data-grad convolution, relu backward and bias reduction
1389
- std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &bias_op};
1390
-
1391
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
1392
- .setHandle(handle_)
1393
- .setOperationGraph(ops.size(), ops.data())
1394
- .build();
1395
-
1396
- // Create string encoding for plan caching
1397
- auto cache_string = getConvFusionString(x_dim, pad, convstride, dilation, w_dim, dataType, opGraph.getTag());
1398
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
1399
-
1400
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
1401
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
1402
-
1403
- auto workspace_size = plan.getWorkspaceSize();
1404
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
1405
-
1406
- void* workspace_ptr = nullptr;
1407
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
1408
- if (workspace_size > 0) {
1409
- workspace_ptr = workspace_tensor.data_ptr<float>();
1410
- }
1411
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrR, devPtrRg, devPtrY};
1412
- int64_t uids[] = {'x', 'w', 'r', 'R', 'y'};
1413
- auto variantPack = cudnn_frontend::VariantPackBuilder()
1414
- .setWorkspacePointer(workspace_ptr)
1415
- .setDataPointers(5, data_ptrs)
1416
- .setUids(5, uids)
1417
- .build();
1418
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
1419
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
1420
- checkCudnnErr(status);
1421
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
1422
- } catch (cudnn_frontend::cudnnException e) {
1423
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
1424
- }
1425
-
1426
- }
1427
-
1428
-
1429
- void
1430
- run_dconv(int64_t* x_dim,
1431
- int64_t* w_dim,
1432
- int64_t* y_dim,
1433
- int64_t* conv_pad,
1434
- int64_t* conv_stride,
1435
- int64_t* conv_dilation,
1436
- cudnnDataType_t dataType,
1437
- at::Half* devPtrX,
1438
- at::Half* devPtrW,
1439
- at::Half* devPtrY,
1440
- cudnnBackendDescriptorType_t mode) {
1441
-
1442
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
1443
- std::stringstream log_buf;
1444
-
1445
- try {
1446
- int conv_dim = 2;
1447
- float alpha = 1.0f;
1448
- float beta = 0.0f;
1449
-
1450
- // Define the convolution problem
1451
- int64_t stride[4];
1452
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
1453
- auto xTensor = cudnn_frontend::TensorBuilder()
1454
- .setDim(4, x_dim)
1455
- .setStrides(4, stride)
1456
- .setId('x')
1457
- .setAlignment(16)
1458
- .setDataType(dataType)
1459
- .build();
1460
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
1461
-
1462
- generateStrides(w_dim, stride, 4, CUDNN_TENSOR_NHWC);
1463
- auto wTensor = cudnn_frontend::TensorBuilder()
1464
- .setDim(4, w_dim)
1465
- .setStrides(4, stride)
1466
- .setId('w')
1467
- .setAlignment(16)
1468
- .setDataType(dataType)
1469
- .build();
1470
- DEBUG_CUDNN_MSG(log_buf, wTensor.describe());
1471
-
1472
- generateStrides(y_dim, stride, 4, CUDNN_TENSOR_NHWC);
1473
- auto yTensor = cudnn_frontend::TensorBuilder()
1474
- .setDim(4, y_dim)
1475
- .setStrides(4, stride)
1476
- .setId('y')
1477
- .setAlignment(16)
1478
- .setDataType(dataType)
1479
- .build();
1480
- DEBUG_CUDNN_MSG(log_buf, yTensor.describe());
1481
-
1482
-
1483
- // Define the convolution problem
1484
- auto convDesc = cudnn_frontend::ConvDescBuilder()
1485
- .setDataType(CUDNN_DATA_FLOAT)
1486
- .setMathMode(CUDNN_CROSS_CORRELATION)
1487
- .setNDims(conv_dim)
1488
- .setStrides(conv_dim, conv_stride)
1489
- .setPrePadding(conv_dim, conv_pad)
1490
- .setPostPadding(conv_dim, conv_pad)
1491
- .setDilation(conv_dim, conv_dilation)
1492
- .build();
1493
- DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
1494
-
1495
- // Create a convolution node
1496
- // mode should be one of the following
1497
- // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
1498
- // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
1499
- auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
1500
- if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
1501
- conv_op_builder.setdxDesc(xTensor)
1502
- .setwDesc(wTensor)
1503
- .setdyDesc(yTensor)
1504
- .setcDesc(convDesc);
1505
- }
1506
- else {
1507
- conv_op_builder.setxDesc(xTensor)
1508
- .setdwDesc(wTensor)
1509
- .setdyDesc(yTensor)
1510
- .setcDesc(convDesc);
1511
- }
1512
- auto conv_op = conv_op_builder
1513
- .setAlpha(alpha)
1514
- .setBeta(beta)
1515
- .build();
1516
- DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
1517
-
1518
- // Create an Operation Graph. In this case it is a single convolution
1519
- std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};
1520
-
1521
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
1522
- .setHandle(handle_)
1523
- .setOperationGraph(ops.size(), ops.data())
1524
- .build();
1525
-
1526
- // Create string encoding for plan caching
1527
- auto cache_string = getConvFusionString(x_dim, conv_pad, conv_stride, conv_dilation, w_dim, dataType, opGraph.getTag());
1528
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
1529
-
1530
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
1531
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
1532
-
1533
- auto workspace_size = plan.getWorkspaceSize();
1534
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
1535
-
1536
- void* workspace_ptr = nullptr;
1537
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
1538
- if (workspace_size > 0) {
1539
- workspace_ptr = workspace_tensor.data_ptr<float>();
1540
- }
1541
- void* data_ptrs[] = {devPtrX, devPtrW, devPtrY};
1542
- int64_t uids[] = {'x', 'w', 'y'};
1543
- auto variantPack = cudnn_frontend::VariantPackBuilder()
1544
- .setWorkspacePointer(workspace_ptr)
1545
- .setDataPointers(3, data_ptrs)
1546
- .setUids(3, uids)
1547
- .build();
1548
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
1549
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
1550
- checkCudnnErr(status);
1551
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
1552
- } catch (cudnn_frontend::cudnnException e) {
1553
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
1554
- }
1555
- }
1556
-
1557
-
1558
- void
1559
- run_dbias(int64_t* x_dim,
1560
- cudnnDataType_t dataType,
1561
- at::Half* devPtrX,
1562
- float* devPtrY) {
1563
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
1564
- std::stringstream log_buf;
1565
- try {
1566
- int convDim = 2;
1567
- int64_t b_dim[] = {1, x_dim[1], 1, 1};
1568
-
1569
- int64_t stride[4];
1570
- generateStrides(x_dim, stride, 4, CUDNN_TENSOR_NHWC);
1571
- auto xTensor = cudnn_frontend::TensorBuilder()
1572
- .setDim(4, x_dim)
1573
- .setStrides(4, stride)
1574
- .setId('x')
1575
- .setAlignment(16)
1576
- .setDataType(dataType)
1577
- .build();
1578
- DEBUG_CUDNN_MSG(log_buf, xTensor.describe());
1579
-
1580
- generateStrides(b_dim, stride, 4, CUDNN_TENSOR_NHWC);
1581
- auto yTensor = cudnn_frontend::TensorBuilder()
1582
- .setDim(4, b_dim)
1583
- .setStrides(4, stride)
1584
- .setId('y')
1585
- .setAlignment(16)
1586
- .setDataType(CUDNN_DATA_FLOAT)
1587
- .build();
1588
- DEBUG_CUDNN_MSG(log_buf, yTensor.describe());
1589
-
1590
- // Define the bias backward operation
1591
- auto biasDesc = cudnn_frontend::ReductionDescBuilder()
1592
- .setMathPrecision(CUDNN_DATA_FLOAT)
1593
- .setReductionOp(CUDNN_REDUCE_TENSOR_ADD)
1594
- .build();
1595
- DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
1596
-
1597
- // Create bias node
1598
- auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)
1599
- .setxDesc(xTensor)
1600
- .setyDesc(yTensor)
1601
- .setreductionDesc(biasDesc)
1602
- .build();
1603
- DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
1604
-
1605
- // Create an Operation Graph. In this case it is bias only
1606
- std::array<cudnn_frontend::Operation const*, 1> ops = {&bias_op};
1607
-
1608
- auto opGraph = cudnn_frontend::OperationGraphBuilder()
1609
- .setHandle(handle_)
1610
- .setOperationGraph(ops.size(), ops.data())
1611
- .build();
1612
-
1613
- // Create string encoding for plan caching
1614
- int64_t pad_dummy[] = {10, 10};
1615
- int64_t stride_dummy[] = {10, 10};
1616
- int64_t dilation_dummy[] = {10, 10};
1617
- auto cache_string = getConvFusionString(x_dim, pad_dummy, stride_dummy, dilation_dummy, b_dim, dataType, opGraph.getTag());
1618
- DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
1619
-
1620
- auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
1621
- DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
1622
-
1623
- auto workspace_size = plan.getWorkspaceSize();
1624
- DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
1625
-
1626
- void* workspace_ptr = nullptr;
1627
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
1628
- if (workspace_size > 0) {
1629
- workspace_ptr = workspace_tensor.data_ptr<float>();
1630
- }
1631
- void* data_ptrs[] = {devPtrX, devPtrY};
1632
- int64_t uids[] = {'x', 'y'};
1633
- auto variantPack = cudnn_frontend::VariantPackBuilder()
1634
- .setWorkspacePointer(workspace_ptr)
1635
- .setDataPointers(2, data_ptrs)
1636
- .setUids(2, uids)
1637
- .build();
1638
- DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
1639
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
1640
- checkCudnnErr(status);
1641
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
1642
- } catch (cudnn_frontend::cudnnException e) {
1643
- std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
1644
- }
1645
-
1646
- }
1647
-
1648
-
1649
- std::vector<at::Tensor> conv_bias_mask_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
1650
- std::cout << std::fixed;
1651
-
1652
- // create output vector
1653
- std::vector<at::Tensor> outputs;
1654
- auto output_format = at::MemoryFormat::ChannelsLast;
1655
-
1656
- // setup dimensions
1657
- int64_t x_dim[] = {0, 0, 0, 0};
1658
- int64_t w_dim[] = {0, 0, 0, 0};
1659
-
1660
- // All dim calculations below follow the n,c,h,w order
1661
- int axis[] = {0, 1, 2, 3};
1662
- for (int dim = 0; dim < 4; dim++) {
1663
- x_dim[dim] = inputs[0].size(axis[dim]);
1664
- w_dim[dim] = inputs[1].size(axis[dim]);
1665
- }
1666
-
1667
- // output dim in n,c,h,w used by backend
1668
- int64_t y_dim[] = {0, 0, 0, 0};
1669
-
1670
- // convolution parameters; dilation is fixed at 1
1671
- int64_t conv_pad[] = {padding, padding};
1672
- int64_t conv_stride[] = {stride, stride};
1673
- int64_t conv_dilation[] = {1, 1};
1674
-
1675
- // compute output from pad/stride/dilation
1676
- y_dim[0] = x_dim[0];
1677
- y_dim[1] = w_dim[0];
1678
- for (int dim = 0; dim < 2; dim++) {
1679
- y_dim[dim + 2] = getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
1680
- }
1681
-
1682
- // run
1683
- at::Half* x = inputs[0].data_ptr<at::Half>();
1684
- at::Half* w = inputs[1].data_ptr<at::Half>();
1685
- at::Half* b = inputs[2].data_ptr<at::Half>();
1686
- int8_t* m = inputs[3].data_ptr<int8_t>();
1687
- auto out = at::empty(y_dim, inputs[0].type(), output_format);
1688
- at::Half* y = out.data_ptr<at::Half>();
1689
-
1690
- run_conv_bias_mask_relu(x_dim,
1691
- w_dim,
1692
- y_dim,
1693
- conv_pad,
1694
- conv_stride,
1695
- conv_dilation,
1696
- CUDNN_DATA_HALF,
1697
- x,
1698
- w,
1699
- b,
1700
- m,
1701
- y);
1702
-
1703
- DEBUG_MSG("[DEBUG] conv-bias-mask-relu : " << y.to(at::kFloat).sum().item<float>());
1704
-
1705
- outputs.push_back(out);
1706
-
1707
- return outputs;
1708
- }
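The y_dim computation in these wrappers relies on getFwdConvOutputDim, defined earlier in this file. It is assumed here to match the standard forward-convolution output-size formula; a minimal sketch under that assumption:

// Standard forward-convolution output size; the real getFwdConvOutputDim
// defined earlier in this file is assumed to reduce to this formula.
#include <cstdint>

inline int64_t fwd_conv_output_dim(int64_t in, int64_t pad, int64_t filter,
                                   int64_t stride, int64_t dilation) {
    int64_t effective_filter = (filter - 1) * dilation + 1;
    return 1 + (in + 2 * pad - effective_filter) / stride;
}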
1709
-
1710
-
1711
- at::Tensor conv_cscale_cbias_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
1712
- std::cout << std::fixed;
1713
-
1714
- // setup dimensions
1715
- int64_t x_dim[] = {0, 0, 0, 0};
1716
- int64_t w_dim[] = {0, 0, 0, 0};
1717
-
1718
- // All dim calculations below follow the n,c,h,w order
1719
- int axis[] = {0, 1, 2, 3};
1720
- for (int dim = 0; dim < 4; dim++) {
1721
- x_dim[dim] = inputs[0].size(axis[dim]);
1722
- w_dim[dim] = inputs[1].size(axis[dim]);
1723
- }
1724
-
1725
- // output dim in n,c,h,w used by backend
1726
- int64_t y_dim[] = {0, 0, 0, 0};
1727
-
1728
- // convolution parameters; dilation is fixed at 1
1729
- int64_t conv_pad[] = {padding, padding};
1730
- int64_t conv_stride[] = {stride, stride};
1731
- int64_t conv_dilation[] = {1, 1};
1732
-
1733
- // compute output from pad/stride/dilation
1734
- y_dim[0] = x_dim[0];
1735
- y_dim[1] = w_dim[0];
1736
- for (int dim = 0; dim < 2; dim++) {
1737
- y_dim[dim + 2] = getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
1738
- }
1739
-
1740
- // run
1741
- at::Half* x = inputs[0].data_ptr<at::Half>();
1742
- at::Half* w = inputs[1].data_ptr<at::Half>();
1743
- at::Half* s = inputs[2].data_ptr<at::Half>();
1744
- at::Half* b = inputs[3].data_ptr<at::Half>();
1745
- auto out = at::empty(y_dim, inputs[0].type(), at::MemoryFormat::ChannelsLast);
1746
- at::Half* y = out.data_ptr<at::Half>();
1747
-
1748
- run_conv_cscale_cbias_relu(x_dim,
1749
- w_dim,
1750
- y_dim,
1751
- conv_pad,
1752
- conv_stride,
1753
- conv_dilation,
1754
- CUDNN_DATA_HALF,
1755
- x,
1756
- w,
1757
- s,
1758
- b,
1759
- y);
1760
-
1761
- DEBUG_MSG("[DEBUG] conv-cscale-cbias-relu : " << y.to(at::kFloat).sum().item<float>());
1762
-
1763
- return out;
1764
- }
1765
-
1766
-
1767
- std::vector<at::Tensor> conv_cscale_cbias_relu_backward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
1768
- bool requires_grad = inputs[0].requires_grad();
1769
-
1770
- for (int i = 0; i <= 4; i++) {
1771
- CHECK_INPUT(inputs[i]);
1772
- }
1773
-
1774
- std::cout << std::fixed;
1775
-
1776
- // create output vector
1777
- std::vector<at::Tensor> outputs;
1778
- auto output_format = at::MemoryFormat::ChannelsLast;
1779
-
1780
- // setup dimensions
1781
- int64_t x_dim[] = {0, 0, 0, 0};
1782
- int64_t w_dim[] = {0, 0, 0, 0};
1783
- int64_t y_dim[] = {0, 0, 0, 0};
1784
-
1785
- // All dim calculations below follow the n,c,h,w order
1786
- int axis[] = {0, 1, 2, 3};
1787
- for (int dim = 0; dim < 4; dim++) {
1788
- x_dim[dim] = inputs[0].size(axis[dim]);
1789
- w_dim[dim] = inputs[1].size(axis[dim]);
1790
- y_dim[dim] = inputs[3].size(axis[dim]);
1791
- }
1792
-
1793
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
1794
-
1795
- int64_t conv_pad[] = {padding, padding};
1796
- int64_t conv_stride[] = {stride, stride};
1797
- int64_t conv_dilation[] = {1, 1};
1798
-
1799
- // run
1800
- // drelu-dscale
1801
- at::Half* dy = inputs[4].data_ptr<at::Half>();
1802
- at::Half* r = inputs[3].data_ptr<at::Half>();
1803
- auto s = inputs[2].data_ptr<at::Half>();
1804
- auto dscale = at::empty_like(inputs[4]);
1805
- at::Half* ds = dscale.data_ptr<at::Half>();
1806
-
1807
- auto options = at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
1808
- run_drelu_dscale(y_dim,
1809
- CUDNN_DATA_HALF,
1810
- dy,
1811
- r,
1812
- s,
1813
- ds);
1814
-
1815
- // conv wgrad
1816
- at::Half* x = inputs[0].data_ptr<at::Half>();
1817
- auto wgrad = at::empty_like(inputs[1]);
1818
- at::Half* dw = wgrad.data_ptr<at::Half>();
1819
- run_dconv(x_dim,
1820
- w_dim,
1821
- y_dim,
1822
- conv_pad,
1823
- conv_stride,
1824
- conv_dilation,
1825
- CUDNN_DATA_HALF,
1826
- x,
1827
- dw,
1828
- ds,
1829
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
1830
-
1831
- // conv dgrad
1832
- at::Half* w = inputs[1].data_ptr<at::Half>();
1833
- auto dgrad = at::empty_like(inputs[0]);
1834
- at::Half* dx = dgrad.data_ptr<at::Half>();
1835
- run_dconv(x_dim,
1836
- w_dim,
1837
- y_dim,
1838
- conv_pad,
1839
- conv_stride,
1840
- conv_dilation,
1841
- CUDNN_DATA_HALF,
1842
- dx,
1843
- w,
1844
- ds,
1845
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
1846
-
1847
- outputs.push_back(dgrad);
1848
- outputs.push_back(wgrad);
1849
-
1850
- return outputs;
1851
- }
1852
-
1853
-
1854
- std::vector<at::Tensor> conv_bias_relu_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
1855
- std::cout << std::fixed;
1856
-
1857
- // create output vector
1858
- std::vector<at::Tensor> outputs;
1859
- auto output_format = at::MemoryFormat::ChannelsLast;
1860
-
1861
- // setup dimensions
1862
- int64_t x_dim[] = {0, 0, 0, 0};
1863
- int64_t w_dim[] = {0, 0, 0, 0};
1864
-
1865
- // All dim calculations below follow the n,c,h,w order
1866
- int axis[] = {0, 1, 2, 3};
1867
- for (int dim = 0; dim < 4; dim++) {
1868
- x_dim[dim] = inputs[0].size(axis[dim]);
1869
- w_dim[dim] = inputs[1].size(axis[dim]);
1870
- }
1871
-
1872
- // output dim in n,c,h,w used by backend
1873
- int64_t y_dim[] = {0, 0, 0, 0};
1874
-
1875
- // convolution parameters; dilation is fixed at 1
1876
- int64_t conv_pad[] = {padding, padding};
1877
- int64_t conv_stride[] = {stride, stride};
1878
- int64_t conv_dilation[] = {1, 1};
1879
-
1880
- // compute output from pad/stride/dilation
1881
- y_dim[0] = x_dim[0];
1882
- y_dim[1] = w_dim[0];
1883
- for (int dim = 0; dim < 2; dim++) {
1884
- y_dim[dim + 2] = getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
1885
- }
1886
-
1887
- // run
1888
- at::Half* x = inputs[0].data_ptr<at::Half>();
1889
- at::Half* w = inputs[1].data_ptr<at::Half>();
1890
- at::Half* b = inputs[2].data_ptr<at::Half>();
1891
- auto out = at::empty(y_dim, inputs[0].type(), output_format);
1892
- at::Half* y = out.data_ptr<at::Half>();
1893
-
1894
- run_conv_bias_relu(x_dim,
1895
- w_dim,
1896
- y_dim,
1897
- conv_pad,
1898
- conv_stride,
1899
- conv_dilation,
1900
- CUDNN_DATA_HALF,
1901
- x,
1902
- w,
1903
- b,
1904
- y);
1905
-
1906
- DEBUG_MSG("[DEBUG] conv-bias-relu : " << y.to(at::kFloat).sum().item<float>());
1907
-
1908
- outputs.push_back(out);
1909
-
1910
- return outputs;
1911
- }
1912
-
1913
-
1914
- std::vector<at::Tensor> conv_bias_relu_backward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
1915
- bool requires_grad = inputs[0].requires_grad();
1916
-
1917
- for (int i = 0; i <= 3; i++) {
1918
- CHECK_INPUT(inputs[i]);
1919
- }
1920
-
1921
- std::cout << std::fixed;
1922
-
1923
- // create output vector
1924
- std::vector<at::Tensor> outputs;
1925
- auto output_format = at::MemoryFormat::ChannelsLast;
1926
-
1927
- // setup dimensions
1928
- int64_t x_dim[] = {0, 0, 0, 0};
1929
- int64_t w_dim[] = {0, 0, 0, 0};
1930
- int64_t y_dim[] = {0, 0, 0, 0};
1931
-
1932
- // All dim calculations below follow the n,c,h,w order
1933
- int axis[] = {0, 1, 2, 3};
1934
- for (int dim = 0; dim < 4; dim++) {
1935
- x_dim[dim] = inputs[0].size(axis[dim]);
1936
- w_dim[dim] = inputs[1].size(axis[dim]);
1937
- y_dim[dim] = inputs[3].size(axis[dim]);
1938
- }
1939
-
1940
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
1941
-
1942
- int64_t conv_pad[] = {padding, padding};
1943
- int64_t conv_stride[] = {stride, stride};
1944
- int64_t conv_dilation[] = {1, 1};
1945
-
1946
- // run
1947
- // drelu-dbias
1948
- at::Half* dy = inputs[3].data_ptr<at::Half>();
1949
- at::Half* r = inputs[2].data_ptr<at::Half>();
1950
- auto drelu = at::empty_like(inputs[2]);
1951
- at::Half* dr = drelu.data_ptr<at::Half>();
1952
- auto options = at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
1953
- auto bgrad = at::empty(b_dim, options, output_format);
1954
- float* db = bgrad.data_ptr<float>();
1955
- run_drelu_dbias(y_dim,
1956
- CUDNN_DATA_HALF,
1957
- dy,
1958
- r,
1959
- dr,
1960
- db);
1961
-
1962
- // conv wgrad
1963
- at::Half* x = inputs[0].data_ptr<at::Half>();
1964
- auto wgrad = at::empty_like(inputs[1]);
1965
- at::Half* dw = wgrad.data_ptr<at::Half>();
1966
- run_dconv(x_dim,
1967
- w_dim,
1968
- y_dim,
1969
- conv_pad,
1970
- conv_stride,
1971
- conv_dilation,
1972
- CUDNN_DATA_HALF,
1973
- x,
1974
- dw,
1975
- dr,
1976
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
1977
-
1978
- // conv dgrad
1979
- at::Half* w = inputs[1].data_ptr<at::Half>();
1980
- auto dgrad = at::empty_like(inputs[0]);
1981
- at::Half* dx = dgrad.data_ptr<at::Half>();
1982
- run_dconv(x_dim,
1983
- w_dim,
1984
- y_dim,
1985
- conv_pad,
1986
- conv_stride,
1987
- conv_dilation,
1988
- CUDNN_DATA_HALF,
1989
- dx,
1990
- w,
1991
- dr,
1992
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
1993
-
1994
- outputs.push_back(dgrad);
1995
- outputs.push_back(wgrad);
1996
- outputs.push_back(bgrad);
1997
-
1998
- return outputs;
1999
-
2000
- }
2001
-
2002
- std::vector<at::Tensor> conv_bias_forward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
2003
- std::cout << std::fixed;
2004
-
2005
- // create output vector
2006
- std::vector<at::Tensor> outputs;
2007
- auto output_format = at::MemoryFormat::ChannelsLast;
2008
-
2009
- // setup dimensions
2010
- int64_t x_dim[] = {0, 0, 0, 0};
2011
- int64_t w_dim[] = {0, 0, 0, 0};
2012
-
2013
- // All dim calculation after this order of n,c,h,w
2014
- int axis[] = {0, 1, 2, 3};
2015
- for (int dim = 0; dim < 4; dim++) {
2016
- x_dim[dim] = inputs[0].size(axis[dim]);
2017
- w_dim[dim] = inputs[1].size(axis[dim]);
2018
- }
2019
-
2020
- // output dim in n,c,h,w used by backend
2021
- int64_t y_dim[] = {0, 0, 0, 0};
2022
-
2023
- // use these fixed values
2024
- int64_t conv_pad[] = {padding, padding};
2025
- int64_t conv_stride[] = {stride, stride};
2026
- int64_t conv_dilation[] = {1, 1};
2027
-
2028
- // compute output from pad/stride/dilation
2029
- y_dim[0] = x_dim[0];
2030
- y_dim[1] = w_dim[0];
2031
- for (int dim = 0; dim < 2; dim++) {
2032
- y_dim[dim + 2] = getFwdConvOutputDim(x_dim[dim + 2], conv_pad[dim], w_dim[dim + 2], conv_stride[dim], conv_dilation[dim]);
2033
- }
2034
-
2035
- // run
2036
- at::Half* x = inputs[0].data_ptr<at::Half>();
2037
- at::Half* w = inputs[1].data_ptr<at::Half>();
2038
- at::Half* b = inputs[2].data_ptr<at::Half>();
2039
- auto out = at::empty(y_dim, inputs[0].type(), output_format);
2040
- at::Half* y = out.data_ptr<at::Half>();
2041
-
2042
- run_conv_bias(x_dim,
2043
- w_dim,
2044
- y_dim,
2045
- conv_pad,
2046
- conv_stride,
2047
- conv_dilation,
2048
- CUDNN_DATA_HALF,
2049
- x,
2050
- w,
2051
- b,
2052
- y);
2053
-
2054
- DEBUG_MSG("[DEBUG] conv-bias : " << y.to(at::kFloat).sum().item<float>());
2055
-
2056
- outputs.push_back(out);
2057
-
2058
- return outputs;
2059
- }
2060
-
2061
-
2062
- std::vector<at::Tensor> conv_bias_backward(std::vector<at::Tensor> inputs, int64_t padding, int64_t stride) {
2063
- bool requires_grad = inputs[0].requires_grad();
2064
-
2065
- for (int i = 0; i <= 2; i++) {
2066
- CHECK_INPUT(inputs[i]);
2067
- }
2068
-
2069
- std::cout << std::fixed;
2070
-
2071
- // create output vector
2072
- std::vector<at::Tensor> outputs;
2073
- auto output_format = at::MemoryFormat::ChannelsLast;
2074
-
2075
- // setup dimensions
2076
- int64_t x_dim[] = {0, 0, 0, 0};
2077
- int64_t w_dim[] = {0, 0, 0, 0};
2078
- int64_t y_dim[] = {0, 0, 0, 0};
2079
-
2080
- // All dim calculation after this order of n,c,h,w
2081
- int axis[] = {0, 1, 2, 3};
2082
- for (int dim = 0; dim < 4; dim++) {
2083
- x_dim[dim] = inputs[0].size(axis[dim]);
2084
- w_dim[dim] = inputs[1].size(axis[dim]);
2085
- y_dim[dim] = inputs[2].size(axis[dim]);
2086
- }
2087
-
2088
- int64_t b_dim[] = {1, y_dim[1], 1, 1};
2089
-
2090
- int64_t conv_pad[] = {padding, padding};
2091
- int64_t conv_stride[] = {stride, stride};
2092
- int64_t conv_dilation[] = {1, 1};
2093
-
2094
- // run
2095
- // dbias
2096
- at::Half* dy = inputs[2].data_ptr<at::Half>();
2097
- auto options = at::TensorOptions().dtype(at::kFloat).layout(inputs[0].layout()).device(inputs[0].device()).requires_grad(false);
2098
- auto bgrad = at::empty(b_dim, options, output_format);
2099
- float* db = bgrad.data_ptr<float>();
2100
- run_dbias(y_dim,
2101
- CUDNN_DATA_HALF,
2102
- dy,
2103
- db);
2104
-
2105
- // conv wgrad
2106
- at::Half* x = inputs[0].data_ptr<at::Half>();
2107
- auto wgrad = at::empty_like(inputs[1]);
2108
- at::Half* dw = wgrad.data_ptr<at::Half>();
2109
- run_dconv(x_dim,
2110
- w_dim,
2111
- y_dim,
2112
- conv_pad,
2113
- conv_stride,
2114
- conv_dilation,
2115
- CUDNN_DATA_HALF,
2116
- x,
2117
- dw,
2118
- dy,
2119
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
2120
-
2121
- // conv dgrad
2122
- at::Half* w = inputs[1].data_ptr<at::Half>();
2123
- auto dgrad = at::empty_like(inputs[0]);
2124
- at::Half* dx = dgrad.data_ptr<at::Half>();
2125
- run_dconv(x_dim,
2126
- w_dim,
2127
- y_dim,
2128
- conv_pad,
2129
- conv_stride,
2130
- conv_dilation,
2131
- CUDNN_DATA_HALF,
2132
- dx,
2133
- w,
2134
- dy,
2135
- CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
2136
-
2137
- outputs.push_back(dgrad);
2138
- outputs.push_back(wgrad);
2139
- outputs.push_back(bgrad);
2140
-
2141
- return outputs;
2142
- }
2143
-
2144
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
2145
- m.def("forward", &conv_bias_relu_forward, "Fused Conv-Bias-ReLU forward");
2146
- m.def("backward", &conv_bias_relu_backward, "Fused Conv-Bias-ReLU backward");
2147
- m.def("forward_no_relu", &conv_bias_forward, "Fused Conv-Bias forward");
2148
- m.def("backward_no_relu", &conv_bias_backward, "Fused Conv-Bias backward");
2149
- m.def("forward_mask", &conv_bias_mask_relu_forward, "Fused Conv-Bias-Mask-ReLU forward");
2150
- m.def("forward_cscale_cbias_relu", &conv_cscale_cbias_relu_forward, "Fused Conv-(const)Scale-(const)Bias-ReLU");
2151
- m.def("backward_cscale_cbias_relu", &conv_cscale_cbias_relu_backward, "Fused Conv-(const)Scale-(const)Bias-ReLU backward");
2152
- }
2153
-
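Note: every forward entry point above derives y_dim via getFwdConvOutputDim, which is declared outside this file and not shown in the diff; assuming it implements the standard convolution output-size rule, a minimal standalone sketch is:

#include <cstdint>
#include <cstdio>

// Assumed behaviour of getFwdConvOutputDim (the helper itself is not part of this diff):
// standard convolution output size with symmetric padding.
static int64_t conv_out_dim(int64_t in, int64_t pad, int64_t kernel, int64_t stride, int64_t dilation) {
    return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
    std::printf("%lld\n", (long long)conv_out_dim(56, 1, 3, 1, 1));  // 56: 3x3 kernel, pad 1, stride 1 preserves H/W
    std::printf("%lld\n", (long long)conv_out_dim(56, 1, 3, 2, 1));  // 28: stride 2 halves the spatial size
    return 0;
}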
apex/apex/contrib/csrc/cudnn_gbn/cudnn_gbn.cpp DELETED
@@ -1,163 +0,0 @@
1
- #include <ATen/ATen.h>
2
- #include <torch/extension.h>
3
- #include <torch/torch.h>
4
- #include <vector>
5
-
6
- #include <iostream>
7
-
8
- #include "norm_sample.h"
9
-
10
- // define this enum:
11
- enum bn_type { BN_FWD, BN_BWD };
12
-
13
- // this is a global variable
14
- static std::map<std::vector<int64_t>, cudnn_frontend::ExecutionPlan> gbn_plan_cache;
15
-
16
- at::Tensor gbn_forward(const at::Tensor& x,
17
- const at::Tensor& scale,
18
- const at::Tensor& bias,
19
- const at::Tensor& running_mean,
20
- const at::Tensor& running_var,
21
- const at::Tensor& minibatch_mean,
22
- const at::Tensor& minibatch_inv_var,
23
- const float momentum,
24
- const float epsilon,
25
- const int64_t bn_group,
26
- const int rank_id,
27
- const std::vector<int64_t> &peer_buffers) {
28
-
29
- int64_t N = x.size(0);
30
- int64_t C = x.size(1);
31
- int64_t H = x.size(2);
32
- int64_t W = x.size(3);
33
-
34
- int64_t tensorDims[] = {N, C, H, W};
35
- int64_t peerDims[] = {bn_group, 4*C, 1, 1};
36
- int64_t perChannelDims[] = {1, C, 1, 1};
37
- int64_t epsilonDims[] = {1, 1, 1, 1};
38
-
39
- // Allocate output tensor
40
- at::Tensor y = at::empty_like(x);
41
-
42
- std::vector<void*> void_peer_buffers;
43
- for (int64_t addr : peer_buffers) {
44
- void_peer_buffers.push_back((void*)addr);
45
- }
46
-
47
- // we need the peer size for the buffer reset
48
- size_t peer_size = 1;
49
- for (size_t i = 0; i < 4; ++i){
50
- peer_size *= peerDims[i];
51
- }
52
-
53
- // sanity check
54
- assert(bn_group == void_peer_buffers.size());
55
-
56
- // check if plan already exists
57
- std::vector<int64_t> fv = {(int64_t)BN_FWD, N, C, H, W, bn_group, (int64_t)CUDNN_DATA_HALF};
58
- if ( gbn_plan_cache.find(fv) == gbn_plan_cache.end() ) {
59
- auto plan = run_batch_norm_forward(tensorDims, perChannelDims, epsilonDims, peerDims, CUDNN_DATA_HALF);
60
- gbn_plan_cache.emplace(fv, std::move(plan));
61
- }
62
-
63
- // get plan and handle
64
- auto plan = gbn_plan_cache.find(fv)->second;
65
-
66
- // execute
67
- execute_batch_norm_forward(plan,
68
- x.data_ptr(),
69
- y.data_ptr(),
70
- scale.data_ptr(),
71
- bias.data_ptr(),
72
- running_mean.data_ptr(),
73
- running_var.data_ptr(),
74
- running_mean.data_ptr(),
75
- running_var.data_ptr(),
76
- minibatch_mean.data_ptr(),
77
- minibatch_inv_var.data_ptr(),
78
- void_peer_buffers,
79
- static_cast<double>(epsilon),
80
- static_cast<double>(momentum),
81
- peer_size,
82
- rank_id);
83
-
84
- return y;
85
- }
86
-
87
- std::vector<at::Tensor> gbn_backward(
88
- const at::Tensor& x,
89
- const at::Tensor& dy,
90
- const at::Tensor& scale,
91
- const at::Tensor& minibatch_mean,
92
- const at::Tensor& minibatch_inv_var,
93
- const float epsilon,
94
- const int64_t bn_group,
95
- const int rank_id,
96
- const std::vector<int64_t> &peer_buffers) {
97
-
98
- int64_t N = x.size(0);
99
- int64_t C = x.size(1);
100
- int64_t H = x.size(2);
101
- int64_t W = x.size(3);
102
-
103
- int64_t tensorDims[] = {N, C, H, W};
104
- int64_t peerDims[] = {bn_group, 4*C, 1, 1};
105
- int64_t perChannelDims[] = {1, C, 1, 1};
106
- int64_t epsilonDims[] = {1, 1, 1, 1};
107
-
108
- // Allocate output tensor
109
- // outputs
110
- at::Tensor x_grad, scale_grad, bias_grad;
111
-
112
- // Allocate outputs
113
- x_grad = at::empty_like(x);
114
- scale_grad = at::empty_like(scale);
115
- bias_grad = at::empty_like(scale);
116
-
117
- std::vector<void*> void_peer_buffers;
118
- for (int64_t addr : peer_buffers) {
119
- void_peer_buffers.push_back((void*)addr);
120
- }
121
-
122
- // we need the peer size for the buffer reset
123
- size_t peer_size = 1;
124
- for (size_t i = 0; i < 4; ++i){
125
- peer_size *= peerDims[i];
126
- }
127
-
128
- assert(bn_group == void_peer_buffers.size());
129
-
130
- std::vector<int64_t> fv = {(int64_t)BN_BWD, N, C, H, W, bn_group, (int64_t)CUDNN_DATA_HALF};
131
- if ( gbn_plan_cache.find(fv) == gbn_plan_cache.end() ) {
132
- auto plan = run_batch_norm_backward(tensorDims, perChannelDims, epsilonDims, peerDims, CUDNN_DATA_HALF);
133
- gbn_plan_cache.emplace(fv, std::move(plan));
134
- }
135
-
136
- // get plan and handle
137
- auto plan = gbn_plan_cache.find(fv)->second;
138
-
139
- // execute
140
- execute_batch_norm_backward(plan,
141
- x.data_ptr(),
142
- dy.data_ptr(),
143
- scale.data_ptr(),
144
- minibatch_mean.data_ptr(),
145
- minibatch_inv_var.data_ptr(),
146
- void_peer_buffers,
147
- x_grad.data_ptr(),
148
- scale_grad.data_ptr(),
149
- bias_grad.data_ptr(),
150
- static_cast<double>(epsilon),
151
- peer_size,
152
- rank_id);
153
-
154
- return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
155
- }
156
-
157
-
158
-
159
-
160
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
161
- m.def("forward", &gbn_forward, "Group batch norm forward");
162
- m.def("backward", &gbn_backward, "Group batch backward");
163
- }
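Both entry points above cache cuDNN frontend execution plans in gbn_plan_cache, keyed by everything that changes the graph (direction, N, C, H, W, group size, data type), because building a plan is far more expensive than looking one up. A minimal sketch of that caching pattern, using a stand-in plan type so it stays self-contained:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Stand-in for cudnn_frontend::ExecutionPlan so the sketch compiles on its own.
struct FakePlan { std::string tag; };

static std::map<std::vector<int64_t>, FakePlan> plan_cache;

// Build the plan on the first call for a given key, then reuse it on every later call.
FakePlan& get_or_build_plan(int64_t direction, int64_t N, int64_t C, int64_t H, int64_t W,
                            int64_t bn_group, int64_t dtype) {
    std::vector<int64_t> key = {direction, N, C, H, W, bn_group, dtype};
    auto it = plan_cache.find(key);
    if (it == plan_cache.end()) {
        // In the real code this is run_batch_norm_forward(...) / run_batch_norm_backward(...).
        it = plan_cache.emplace(key, FakePlan{"built"}).first;
    }
    return it->second;
}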
apex/apex/contrib/csrc/cudnn_gbn/norm_sample.cpp DELETED
@@ -1,479 +0,0 @@
1
- /*
2
- * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3
- *
4
- * Permission is hereby granted, free of charge, to any person obtaining a
5
- * copy of this software and associated documentation files (the "Software"),
6
- * to deal in the Software without restriction, including without limitation
7
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
- * and/or sell copies of the Software, and to permit persons to whom the
9
- * Software is furnished to do so, subject to the following conditions:
10
- *
11
- * The above copyright notice and this permission notice shall be included in
12
- * all copies or substantial portions of the Software.
13
- *
14
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20
- * DEALINGS IN THE SOFTWARE.
21
- */
22
-
23
- #include "norm_sample.h"
24
- #include <cudnn_frontend.h>
25
- #include "cudnn_backend.h"
26
- #include <ATen/cudnn/Handle.h> // for getcudnnhandle
27
- #include <torch/extension.h>
28
- #include <torch/torch.h>
29
-
30
- // some helpers
31
- int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line) {
32
- if (code) {
33
- printf("CUDA error at %s:%d, code=%d (%s) in '%s'", file, line, (int)code, cudaGetErrorString(code), expr);
34
- return 1;
35
- }
36
- return 0;
37
- }
38
-
39
- int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
40
- if (code) {
41
- printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
42
- return 1;
43
- }
44
- return 0;
45
- }
46
-
47
- bool
48
- AllowAll(cudnnBackendDescriptor_t engine_config) {
49
- (void)engine_config;
50
- return false;
51
- }
52
-
53
- void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
54
- // For INT8x4 and INT8x32 we still compute standard strides here to input
55
- // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
56
- if (filterFormat == CUDNN_TENSOR_NCHW) {
57
- strideA[nbDims - 1] = 1;
58
- for (int64_t d = nbDims - 2; d >= 0; d--) {
59
- strideA[d] = strideA[d + 1] * dimA[d + 1];
60
- }
61
- } else {
62
- // Here we assume that the format is CUDNN_TENSOR_NHWC
63
- strideA[1] = 1;
64
- strideA[nbDims - 1] = strideA[1] * dimA[1];
65
- for (int64_t d = nbDims - 2; d >= 2; d--) {
66
- strideA[d] = strideA[d + 1] * dimA[d + 1];
67
- }
68
- strideA[0] = strideA[2] * dimA[2];
69
- }
70
- }
71
-
72
-
73
- // runtime
74
- cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims,
75
- int64_t *perChannelSum,
76
- int64_t *epsilon,
77
- int64_t *peerDims,
78
- cudnnDataType_t data_type) {
79
-
80
- // get the cudnn handle
81
- cudnnHandle_t handle = torch::native::getCudnnHandle();
82
-
83
- // Creates the necessary tensor descriptors
84
- int64_t tensor_stride[4];
85
- int64_t stride[4];
86
- int64_t peer_stride[4];
87
-
88
- // NHWC format. generateStrides() takes care of this. However, tensor dims should still be NCHW
89
- generateStrides(tensorDims, tensor_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
90
- generateStrides(peerDims, peer_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
91
-
92
- auto tensor_create = [&tensor_stride, &tensorDims](cudnnDataType_t type,
93
- int64_t id) {
94
- return cudnn_frontend::TensorBuilder()
95
- .setDim(4, tensorDims)
96
- .setStrides(4, tensor_stride)
97
- .setId(id)
98
- .setAlignment(16)
99
- .setDataType(type)
100
- .build();
101
- };
102
-
103
- auto peer_tensor_create = [&peer_stride, &tensorDims](cudnnDataType_t type,
104
- int64_t id) {
105
- return cudnn_frontend::TensorBuilder()
106
- .setDim(4, tensorDims)
107
- .setStrides(4, peer_stride)
108
- .setId(id)
109
- .setAlignment(16)
110
- .setDataType(type)
111
- .build();
112
- };
113
-
114
-
115
- generateStrides(perChannelSum, stride, (int64_t)4, CUDNN_TENSOR_NHWC);
116
-
117
- auto per_channel_tensor_create = [&stride, &perChannelSum](cudnnDataType_t type, int64_t id) {
118
- return cudnn_frontend::TensorBuilder()
119
- .setDim(4, perChannelSum)
120
- .setStrides(4, stride)
121
- .setId(id)
122
- .setAlignment(16)
123
- .setDataType(type)
124
- .build();
125
- };
126
-
127
- auto xTensor = tensor_create(data_type, 100);
128
- auto yTensor = tensor_create(data_type, 101);
129
- auto scaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 102);
130
- auto biasTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 103);
131
- auto inMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 104);
132
- auto inVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 105);
133
- auto outMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 106);
134
- auto outVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 107);
135
- auto savedMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 108);
136
- auto savedInvVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 109);
137
-
138
-
139
- int64_t epsilon_stride[4];
140
- generateStrides(epsilon, epsilon_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
141
- auto scalar_tensor_create = [&epsilon_stride, &epsilon](cudnnDataType_t type, int64_t id) {
142
- return cudnn_frontend::TensorBuilder()
143
- .setDim(4, epsilon)
144
- .setStrides(4, epsilon_stride)
145
- .setId(id)
146
- .setAlignment(16)
147
- .setDataType(type)
148
- .setByValue(true)
149
- .build();
150
- };
151
-
152
- auto epsilonTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 110);
153
- auto expDecayTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 111);
154
-
155
- // Create the peer stat tensors (one per rank in the group). Jump IDs in case we need to add more tensors with UIDs
156
- std::vector<cudnn_frontend::Tensor_v8> peerStatTensors;
157
- for (size_t i = 112; i < 112 + peerDims[0]; ++i) {
158
- peerStatTensors.push_back(peer_tensor_create(CUDNN_DATA_FLOAT, i));
159
- }
160
-
161
- #if (CUDNN_VERSION >= 8500)
162
- // Batch normalization
163
- cudnnBackendNormMode_t normalizationMode = CUDNN_BATCH_NORM;
164
-
165
- // Forward training
166
- cudnnBackendNormFwdPhase_t phase = CUDNN_NORM_FWD_TRAINING;
167
-
168
- //Create a Finalize node
169
- auto batch_norm_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)
170
- .setNormalizationMode(normalizationMode)
171
- .setNormFwdPhase(phase)
172
- .setxDesc(xTensor)
173
- .setScaleAndBias(scaleTensor, biasTensor)
174
- .setPrevRunningMeanAndVar(inMeanTensor, inVarTensor)
175
- .setNextRunningMeanAndVar(outMeanTensor, outVarTensor)
176
- .setSavedMeanAndInvVar(savedMeanTensor, savedInvVarTensor)
177
- .setEpsilonTensor(epsilonTensor)
178
- .setExpDecayFactorTensor(expDecayTensor)
179
- .setPeerStatTensor(peerStatTensors)
180
- .setyDesc(yTensor)
181
- .build();
182
-
183
- std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
184
- #else
185
- std::array<cudnn_frontend::Operation const*, 0> ops = {};
186
- #endif
187
- auto opGraph = cudnn_frontend::OperationGraphBuilder().setHandle(handle).setOperationGraph(ops.size(), ops.data()).build();
188
- //std::cout << opGraph.describe() << std::endl;
189
-
190
- cudnn_frontend::EngineConfigList filtered_configs;
191
- auto statuses =
192
- cudnn_frontend::get_heuristics_list<2>({"heuristics_instant"
193
- , "heuristics_fallback"
194
- }, opGraph,::AllowAll, filtered_configs, true);
195
-
196
- //std::cout << "get_heuristics_list Statuses: ";
197
- //for (auto i = 0u ; i < statuses.size(); i++) {
198
- // std::cout << cudnn_frontend::to_string(statuses[i]) << " ";
199
- //}
200
- //std::cout << std::endl;
201
- //std::cout << "Filter config list has " << filtered_configs.size() << " configurations " << std::endl;
202
-
203
- // some verbose printing:
204
- //std::cout << "Tensor shape: (" << tensorDims[0] << ", " << tensorDims[1] << ", " << tensorDims[2] << ", " << tensorDims[3] << ")" << std::endl;
205
-
206
- auto plan_builder = [&filtered_configs, &opGraph, &handle]() {
207
- for (auto i = 0u; i < filtered_configs.size(); i++) {
208
- try {
209
- auto plan = cudnn_frontend::ExecutionPlanBuilder().setHandle(handle).setEngineConfig(filtered_configs[i], opGraph.getTag()).build();
210
- return plan;
211
- } catch (cudnn_frontend::cudnnException &e) {
212
- continue;
213
- }
214
- }
215
- return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle).setEngineConfig(filtered_configs[0], opGraph.getTag()).build();
216
- };
217
-
218
- assert(filtered_configs.size() > 0);
219
- auto plan = plan_builder();
220
-
221
- return plan;
222
-
223
- }
224
-
225
- void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan,
226
- void *xDevPtr,
227
- void *yDevPtr,
228
- void *scaledevPtr,
229
- void *biasdevPtr,
230
- void *in_meandevPtr,
231
- void *in_vardevPtr,
232
- void *out_meandevPtr,
233
- void *out_vardevPtr,
234
- void *saved_meandevPtr,
235
- void *saved_inv_vardevPtr,
236
- const std::vector<void*> &peer_devPtrs,
237
- double epsilon_val,
238
- double exponential_decay_factor,
239
- size_t peer_size,
240
- int rank_id) {
241
-
242
- // get handle
243
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
244
-
245
- // get stream
246
- cudaStream_t stream;
247
- cudnnGetStream(handle_, &stream);
248
-
249
- try {
250
- // allocate workspace
251
- auto workspace_size = plan.getWorkspaceSize();
252
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
253
- void* workPtr = nullptr;
254
- if (workspace_size > 0) {
255
- workPtr = workspace_tensor.data_ptr<float>();
256
- }
257
-
258
- // first the data pointers
259
- std::vector<void*> data_ptrs {xDevPtr, yDevPtr, scaledevPtr, biasdevPtr,
260
- in_meandevPtr, in_vardevPtr, out_meandevPtr, out_vardevPtr,
261
- saved_meandevPtr, saved_inv_vardevPtr,
262
- &epsilon_val, &exponential_decay_factor};
263
- data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
264
- // then the uids
265
- std::vector<int64_t> uids;
266
- for (size_t i = 100; i < 100 + data_ptrs.size(); ++i) {
267
- uids.push_back(i);
268
- }
269
- auto variantPack = cudnn_frontend::VariantPackBuilder()
270
- .setWorkspacePointer(workPtr)
271
- .setDataPointers(data_ptrs.size(), data_ptrs.data())
272
- .setUids(uids.size(), uids.data())
273
- .build();
274
- //std::cout << "variantPack " << variantPack.describe() << std::endl;
275
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
276
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
277
-
278
- // Reset local communication buffer
279
- cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size*4, stream);
280
-
281
- } catch (cudnn_frontend::cudnnException &e) {
282
- struct cudaDeviceProp prop;
283
- checkCudaErr(cudaGetDeviceProperties(&prop, 0));
284
- if (prop.major == 8) {
285
- std::cout << "[ERROR] Exception " << e.what() << std::endl;
286
- assert(false);
287
- }
288
- }
289
- }
290
-
291
- cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims,
292
- int64_t *perChannelSum,
293
- int64_t *epsilon,
294
- int64_t *peerDims,
295
- cudnnDataType_t data_type) {
296
-
297
- // get cudnn handle
298
- cudnnHandle_t handle = torch::native::getCudnnHandle();
299
-
300
- // Creates the necessary tensor descriptors
301
- int64_t tensor_stride[4];
302
- int64_t stride[4];
303
- int64_t peer_stride[4];
304
-
305
- // NHWC format. generateStrides() takes care of this. However, tensor dims should still be NCHW
306
- generateStrides(tensorDims, tensor_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
307
- generateStrides(peerDims, peer_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
308
-
309
- auto tensor_create = [&tensor_stride, &tensorDims](cudnnDataType_t type, int64_t id) {
310
- return cudnn_frontend::TensorBuilder()
311
- .setDim(4, tensorDims)
312
- .setStrides(4, tensor_stride)
313
- .setId(id)
314
- .setAlignment(16)
315
- .setDataType(type)
316
- .build();
317
- };
318
-
319
- auto peer_tensor_create = [&peer_stride, &peerDims](cudnnDataType_t type, int64_t id) {
320
- return cudnn_frontend::TensorBuilder()
321
- .setDim(4, peerDims)
322
- .setStrides(4, peer_stride)
323
- .setId(id)
324
- .setAlignment(16)
325
- .setDataType(type)
326
- .build();
327
- };
328
-
329
- generateStrides(perChannelSum, stride, (int64_t)4, CUDNN_TENSOR_NHWC);
330
-
331
- auto per_channel_tensor_create = [&stride, &perChannelSum](cudnnDataType_t type, int64_t id) {
332
- return cudnn_frontend::TensorBuilder()
333
- .setDim(4, perChannelSum)
334
- .setStrides(4, stride)
335
- .setId(id)
336
- .setAlignment(16)
337
- .setDataType(type)
338
- .build();
339
- };
340
-
341
- auto xTensor = tensor_create(data_type, 100);
342
- auto dyTensor = tensor_create(data_type, 101);
343
- auto scaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 102);
344
- auto savedMeanTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 103);
345
- auto savedInvVarTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 104);
346
- auto dxTensor = tensor_create(data_type, 105);
347
- auto dScaleTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 106);
348
- auto dBiasTensor = per_channel_tensor_create(CUDNN_DATA_FLOAT, 107);
349
-
350
- int64_t epsilon_stride[4];
351
- generateStrides(epsilon, epsilon_stride, (int64_t)4, CUDNN_TENSOR_NHWC);
352
- auto scalar_tensor_create = [&epsilon_stride, &epsilon](cudnnDataType_t type, int64_t id) {
353
- return cudnn_frontend::TensorBuilder()
354
- .setDim(4, epsilon)
355
- .setStrides(4, epsilon_stride)
356
- .setId(id)
357
- .setAlignment(16)
358
- .setDataType(type)
359
- .setByValue(true)
360
- .build();
361
- };
362
-
363
- auto epsilonTensor = scalar_tensor_create(CUDNN_DATA_DOUBLE, 108);
364
-
365
- std::vector<cudnn_frontend::Tensor_v8> peerStatTensors;
366
- for (size_t i = 109; i < 109 + peerDims[0]; ++i) {
367
- peerStatTensors.push_back(peer_tensor_create(CUDNN_DATA_FLOAT, i));
368
- }
369
-
370
- #if (CUDNN_VERSION >= 8500)
371
- // Batch normalization
372
- cudnnBackendNormMode_t normalizationMode = CUDNN_BATCH_NORM;
373
-
374
- //Create a Finalize node
375
- auto batch_norm_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)
376
- .setNormalizationMode(normalizationMode)
377
- .setxDesc(xTensor)
378
- .setSavedMeanAndInvVar(savedMeanTensor, savedInvVarTensor)
379
- .setdyDesc(dyTensor)
380
- .setScale(scaleTensor)
381
- .setEpsilonTensor(epsilonTensor)
382
- .setDScaleAndDBias(dScaleTensor, dBiasTensor)
383
- .setdxDesc(dxTensor)
384
- .setPeerStatTensor(peerStatTensors)
385
- .build();
386
-
387
- std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
388
- #else
389
- std::array<cudnn_frontend::Operation const*, 0> ops = {};
390
- #endif
391
-
392
- auto opGraph = cudnn_frontend::OperationGraphBuilder().setHandle(handle).setOperationGraph(ops.size(), ops.data()).build();
393
- //std::cout << opGraph.describe() << std::endl;
394
-
395
- cudnn_frontend::EngineConfigList filtered_configs;
396
- auto statuses =
397
- cudnn_frontend::get_heuristics_list<2>({"heuristics_instant"
398
- , "heuristics_fallback"
399
- }, opGraph,::AllowAll, filtered_configs, true);
400
-
401
- auto plan_builder = [&filtered_configs, &opGraph, &handle]() {
402
- for (auto i = 0u; i < filtered_configs.size(); i++) {
403
- try {
404
- auto plan = cudnn_frontend::ExecutionPlanBuilder().setHandle(handle).setEngineConfig(filtered_configs[i], opGraph.getTag()).build();
405
- return plan;
406
- } catch (cudnn_frontend::cudnnException &e) {
407
- continue;
408
- }
409
- }
410
- return cudnn_frontend::ExecutionPlanBuilder().setHandle(handle).setEngineConfig(filtered_configs[0], opGraph.getTag()).build();
411
- };
412
-
413
- assert(filtered_configs.size() > 0);
414
- auto plan = plan_builder();
415
-
416
- return plan;
417
- }
418
-
419
- void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan,
420
- void *xDevPtr,
421
- void *dyDevPtr,
422
- void *scaledevPtr,
423
- void *saved_meandevPtr,
424
- void *saved_inv_vardevPtr,
425
- const std::vector<void*> &peer_devPtrs,
426
- void *dxDevPtr,
427
- void *dscaledevPtr,
428
- void *dbiasdevPtr,
429
- double epsilon_val,
430
- size_t peer_size,
431
- int rank_id) {
432
-
433
- // get handle
434
- cudnnHandle_t handle_ = torch::native::getCudnnHandle();
435
-
436
- // get stream
437
- cudaStream_t stream;
438
- cudnnGetStream(handle_, &stream);
439
-
440
- try {
441
- // allocate workspace
442
- auto workspace_size = plan.getWorkspaceSize();
443
- auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
444
- void* workPtr = nullptr;
445
- if (workspace_size > 0) {
446
- workPtr = workspace_tensor.data_ptr<float>();
447
- }
448
-
449
- // create helper arrays
450
- std::vector<void*> data_ptrs {xDevPtr, dyDevPtr, scaledevPtr,
451
- saved_meandevPtr, saved_inv_vardevPtr,
452
- dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
453
- data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
454
- std::vector<int64_t> uids;
455
- for (size_t i = 100; i < 100 + data_ptrs.size(); ++i) {
456
- uids.push_back(i);
457
- }
458
-
459
- auto variantPack = cudnn_frontend::VariantPackBuilder()
460
- .setWorkspacePointer(workPtr)
461
- .setDataPointers(data_ptrs.size(), data_ptrs.data())
462
- .setUids(uids.size(), uids.data())
463
- .build();
464
- cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
465
-
466
- cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error", status);
467
-
468
- // Reset local communication buffer
469
- cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size*4, stream);
470
-
471
- } catch (cudnn_frontend::cudnnException &e) {
472
- struct cudaDeviceProp prop;
473
- checkCudaErr(cudaGetDeviceProperties(&prop, 0));
474
- if (prop.major == 8) {
475
- std::cout << "[ERROR] Exception " << e.what() << std::endl;
476
- assert(false);
477
- }
478
- }
479
- }
apex/apex/contrib/csrc/cudnn_gbn/norm_sample.h DELETED
@@ -1,153 +0,0 @@
1
- #pragma once
2
-
3
- /*
4
- * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5
- *
6
- * Permission is hereby granted, free of charge, to any person obtaining a
7
- * copy of this software and associated documentation files (the "Software"),
8
- * to deal in the Software without restriction, including without limitation
9
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
- * and/or sell copies of the Software, and to permit persons to whom the
11
- * Software is furnished to do so, subject to the following conditions:
12
- *
13
- * The above copyright notice and this permission notice shall be included in
14
- * all copies or substantial portions of the Software.
15
- *
16
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
- * DEALINGS IN THE SOFTWARE.
23
- */
24
-
25
- #pragma once
26
-
27
- #include <iostream>
28
- #include <inttypes.h>
29
- #include <stdlib.h>
30
- #include <string.h>
31
- #include <ctype.h>
32
- #include <assert.h>
33
- #include <tuple>
34
- #include <functional>
35
-
36
- #include <cudnn.h>
37
- #include <cudnn_frontend.h>
38
-
39
- /* some helpers
40
- */
41
- void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat);
42
-
43
- int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line);
44
- int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line);
45
-
46
- #define checkCudaErr(...) \
47
- do { \
48
- int64_t err = checkCudaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
49
- assert(err == 0); \
50
- } while (0)
51
-
52
- #define checkCudnnErr(...) \
53
- do { \
54
- int64_t err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
55
- assert(err == 0); \
56
- } while (0)
57
-
58
- /**
59
- * @brief Run a Group BN forward sample with 2 peer stat tensors.
60
- *
61
- * @param tensorDims an array with shape (N, C, H, W) for input tensor dims. Stride in NHWC or NCHW will take care of memory format
62
- * @param perChannelSum an array with shape (1, C, 1, 1) to denote the sum values for each channel in the input tensor
63
- * @param epsilon a scalar array with shape (1, 1, 1, 1) to represent the epsilon value for the BN
64
- * @param peerDims an array with shape (num GPUs, 2 * C, 1, 1) to denote the tensor dimensions for peer stat tensor in GBN
65
-
66
- *
67
- */
68
- cudnn_frontend::ExecutionPlan run_batch_norm_forward(
69
- int64_t *tensorDims,
70
- int64_t *perChannelSum,
71
- int64_t *epsilon,
72
- int64_t *peerDims,
73
- cudnnDataType_t in_out_data_type);
74
- /**
75
- * @param xDevPtr input tensor device pointer
76
- * @param yDevPtr output tensor device pointer
77
- * @param scaledevPtr input scale device pointer for BN scaling
78
- * @param biasdevPtr input scale device pointer for BN bias
79
- * @param in_meandevPtr Input mean device pointer
80
- * @param in_vardevPtr Input variance device pointer
81
- * @param out_meandevPtr output mean device pointer
82
- * @param out_vardevPtr output variance device pointer
83
- * @param saved_meandevPtr saved mean device pointer for BN backward
84
- * @param saved_inv_vardevPtr saved inverse variance device pointer for BN backward
85
- * @param peer_devPtrs vector of peer stat tensor device pointers (one per rank)
86
- * @param peer_size number of elements in each peer stat tensor
87
- * @param epsilon_val epsilon value as a double
88
- * @param exponential_decay_factor exponential decay factor (momentum) as a double
89
- *
90
- **/
91
- void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan,
92
- void *xDevPtr,
93
- void *yDevPtr,
94
- void *scaledevPtr,
95
- void *biasdevPtr,
96
- void *in_meandevPtr,
97
- void *in_vardevPtr,
98
- void *out_meandevPtr,
99
- void *out_vardevPtr,
100
- void *saved_meandevPtr,
101
- void *saved_inv_vardevPtr,
102
- const std::vector<void*> &peer_devPtrs,
103
- double epsilon_val,
104
- double exponential_decay_factor,
105
- size_t peer_size,
106
- int rank_id);
107
-
108
- /**
109
- * @brief Run a Group BN backward sample with 2 peer stat tensors.
110
- *
111
- * @param tensorDims an array with shape (N, C, H, W) for input tensor dims. Stride in NHWC or NCHW will take care of memory format
112
- * @param perChannelSum an array with shape (1, C, 1, 1) to denote the sum values for each channel in the input tensor
113
- * @param epsilon a scalar array with shape (1, 1, 1, 1) to represent the epsilon value for the BN
114
- * @param peerDims an array with shape (num GPUs, 2 * C, 1, 1) to denote the tensor dimensions for peer stat tensor in GBN
115
- *
116
- */
117
- cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims,
118
- int64_t *perChannelSum,
119
- int64_t *epsilon,
120
- int64_t *peerDims,
121
- cudnnDataType_t data_type);
122
-
123
- /**
124
- * @brief Run a Group BN backward sample with 2 peer stat tensors.
125
- *
126
- * @param xDevPtr input tensor device pointer
127
- * @param dyDevPtr output-gradient (dy) tensor device pointer
128
- * @param scaledevPtr input scale device pointer for BN scaling
129
- * @param saved_meandevPtr saved mean device pointer from the forward pass
130
- * @param saved_inv_vardevPtr saved inverse variance device pointer from the forward pass
131
- * @param peer_devPtrs vector of peer stat tensor device pointers (one per rank)
132
- * @param dxDevPtr input-gradient (dx) tensor device pointer
133
- * @param dscaledevPtr scale gradient device pointer
134
- * @param dbiasdevPtr bias gradient device pointer
135
- * @param epsilon_val epsilon value as a double
136
- * @param peer_size number of elements in each peer stat tensor
137
- * @param rank_id index of this rank's peer stat buffer in peer_devPtrs
139
- *
140
- */
141
- void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan,
142
- void *xDevPtr,
143
- void *dyDevPtr,
144
- void *scaledevPtr,
145
- void *saved_meandevPtr,
146
- void *saved_inv_vardevPtr,
147
- const std::vector<void*> &peer_devPtrs,
148
- void *dxDevPtr,
149
- void *dscaledevPtr,
150
- void *dbiasdevPtr,
151
- double epsilon_val,
152
- size_t peer_size,
153
- int rank_id);
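Taken together, the header describes a two-step API: build (and ideally cache) an ExecutionPlan for a given problem size, then execute it with raw device pointers. A minimal sketch of one forward call, mirroring the pointer order used by gbn_forward in cudnn_gbn.cpp and assuming a single-GPU group (bn_group = 1), NHWC fp16 input, and already-allocated device buffers:

#include <vector>
#include "norm_sample.h"

// Sketch only: drive the plan-build / execute pair for a single forward pass.
void group_bn_forward_once(void* x, void* y, void* scale, void* bias,
                           void* running_mean, void* running_var,
                           void* saved_mean, void* saved_inv_var,
                           void* peer_buffer,
                           int64_t N, int64_t C, int64_t H, int64_t W) {
    int64_t tensorDims[]     = {N, C, H, W};
    int64_t perChannelDims[] = {1, C, 1, 1};
    int64_t epsilonDims[]    = {1, 1, 1, 1};
    int64_t peerDims[]       = {1, 4 * C, 1, 1};  // bn_group = 1, matching cudnn_gbn.cpp

    // Step 1: build the execution plan (cache it per problem size in real use).
    auto plan = run_batch_norm_forward(tensorDims, perChannelDims, epsilonDims,
                                       peerDims, CUDNN_DATA_HALF);

    // Step 2: execute with device pointers. Running stats are read and updated in place,
    // batch stats are saved for backward, and the local peer buffer (rank_id) is reset.
    std::vector<void*> peers = {peer_buffer};
    size_t peer_size = 1 * 4 * C;  // number of elements in each peer stat buffer
    execute_batch_norm_forward(plan, x, y, scale, bias,
                               running_mean, running_var, running_mean, running_var,
                               saved_mean, saved_inv_var,
                               peers,
                               /*epsilon_val=*/1e-5, /*exponential_decay_factor=*/0.1,
                               peer_size, /*rank_id=*/0);
}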
apex/apex/contrib/csrc/fmha/fmha_api.cpp DELETED
@@ -1,365 +0,0 @@
1
- /******************************************************************************
2
- * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
3
- *
4
- * Redistribution and use in source and binary forms, with or without
5
- * modification, are permitted provided that the following conditions are met:
6
- * * Redistributions of source code must retain the above copyright
7
- * notice, this list of conditions and the following disclaimer.
8
- * * Redistributions in binary form must reproduce the above copyright
9
- * notice, this list of conditions and the following disclaimer in the
10
- * documentation and/or other materials provided with the distribution.
11
- * * Neither the name of the NVIDIA CORPORATION nor the
12
- * names of its contributors may be used to endorse or promote products
13
- * derived from this software without specific prior written permission.
14
- *
15
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
- *
26
- ******************************************************************************/
27
-
28
- #include <torch/extension.h>
29
- #include <ATen/cuda/CUDAContext.h>
30
-
31
- #include "fmha.h"
32
-
33
- extern at::Tensor & mha_fill(at::Tensor &self, const at::Tensor &start_index);
34
- void set_params(Fused_multihead_attention_fprop_params &params,
35
- // sizes
36
- const size_t b,
37
- const size_t s,
38
- const size_t h,
39
- const size_t d,
40
- // device pointers
41
- void *qkv_packed_d,
42
- void *cu_seqlens_d,
43
- void *o_packed_d,
44
- void *s_d,
45
- float p_dropout) {
46
-
47
- Data_type acc_type = DATA_TYPE_FP32;
48
- Data_type data_type = DATA_TYPE_FP16;
49
-
50
- // Reset the parameters
51
- memset(&params, 0, sizeof(params));
52
-
53
- // Set the pointers and strides.
54
- params.qkv_ptr = qkv_packed_d;
55
- params.qkv_stride_in_bytes = get_size_in_bytes(h * 3 * d, data_type);
56
- params.o_ptr = o_packed_d;
57
- params.o_stride_in_bytes = get_size_in_bytes(h * d, data_type);
58
-
59
- params.cu_seqlens = static_cast<int *>(cu_seqlens_d);
60
-
61
- // S = softmax(P)
62
- params.s_ptr = s_d;
63
- params.s_stride_in_bytes = get_size_in_bytes(b * h * s, data_type);
64
-
65
- // Set the dimensions.
66
- params.b = b;
67
- params.h = h;
68
- params.s = s;
69
- params.d = d;
70
-
71
- // Set the different scale values.
72
- const float scale_bmm1 = 1.f / sqrtf(d);
73
- constexpr float scale_softmax = 1.f;
74
- constexpr float scale_bmm2 = 1.f;
75
-
76
- set_alpha(params.scale_bmm1, scale_bmm1, data_type);
77
- set_alpha(params.scale_softmax, scale_softmax, acc_type);
78
- set_alpha(params.scale_bmm2, scale_bmm2, data_type);
79
-
80
- // Set this to probability of keeping an element to simplify things.
81
- params.p_dropout = 1.f - p_dropout;
82
- params.rp_dropout = 1.f / params.p_dropout;
83
- TORCH_CHECK(p_dropout < 1.f);
84
- set_alpha(params.scale_dropout, params.rp_dropout, data_type);
85
- }
86
-
87
- std::vector<at::Tensor>
88
- mha_fwd(const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
89
- const at::Tensor &cu_seqlens, // b+1
90
- const float p_dropout,
91
- const int max_seq_len,
92
- const bool is_training,
93
- const bool is_nl,
94
- const bool zero_tensors,
95
- c10::optional<at::Generator> gen_) {
96
-
97
- using namespace torch::indexing;
98
- auto dprops = at::cuda::getCurrentDeviceProperties();
99
- TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) ||
100
- (dprops->major == 9 && dprops->minor == 0));
101
- auto stream = at::cuda::getCurrentCUDAStream().stream();
102
- Launch_params<Fused_multihead_attention_fprop_params> launch_params(dprops, stream, is_training, is_nl);
103
-
104
- int seq_len = 512;
105
- auto launch = &run_fmha_fp16_512_64_sm80;
106
- if( max_seq_len <= 128 ) {
107
- seq_len = 128;
108
- launch = &run_fmha_fp16_128_64_sm80;
109
- } else if( max_seq_len <= 256 ) {
110
- seq_len = 256;
111
- launch = &run_fmha_fp16_256_64_sm80;
112
- } else if( max_seq_len <= 384 ) {
113
- seq_len = 384;
114
- launch = &run_fmha_fp16_384_64_sm80;
115
- } else if( max_seq_len <= 512 ) {
116
- seq_len = 512;
117
- launch = &run_fmha_fp16_512_64_sm80;
118
- } else {
119
- TORCH_CHECK(false);
120
- }
121
-
122
- TORCH_CHECK(qkv.is_cuda())
123
- TORCH_CHECK(cu_seqlens.is_cuda())
124
-
125
- TORCH_CHECK(qkv.is_contiguous())
126
- TORCH_CHECK(cu_seqlens.is_contiguous())
127
-
128
- TORCH_CHECK(cu_seqlens.dim() == 1);
129
- TORCH_CHECK(qkv.dim() == 4);
130
-
131
- const auto sizes = qkv.sizes();
132
-
133
- TORCH_CHECK(sizes[THREE_DIM] == 3);
134
-
135
- const int batch_size = cu_seqlens.numel() - 1;
136
- const int total = sizes[TOTAL_DIM];
137
- const int num_heads = sizes[H_DIM];
138
- const int head_size = sizes[D_DIM];
139
- TORCH_CHECK(batch_size > 0);
140
- TORCH_CHECK(head_size == 64);
141
- auto opts = qkv.options();
142
-
143
- auto ctx = torch::empty({ total, num_heads, head_size }, opts);
144
-
145
- auto s = torch::empty({ batch_size, num_heads, seq_len, seq_len }, opts);
146
-
147
- if( zero_tensors ) {
148
- mha_fill(ctx, cu_seqlens.index({Slice(-1,None)}));
149
- }
150
-
151
- auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
152
- gen_, at::cuda::detail::getDefaultCUDAGenerator());
153
-
154
-
155
- set_params(launch_params.params,
156
- batch_size,
157
- seq_len,
158
- num_heads,
159
- head_size,
160
- qkv.data_ptr(),
161
- cu_seqlens.data_ptr(),
162
- ctx.data_ptr(),
163
- s.data_ptr(),
164
- p_dropout);
165
-
166
- launch(launch_params, /*configure=*/ true);
167
- // number of times random will be generated per thread, to offset philox counter in thc random
168
- // state
169
- int64_t counter_offset = launch_params.elts_per_thread;
170
- at::PhiloxCudaState rng_engine_inputs;
171
-
172
- if( is_training ) {
173
- // See Note [Acquire lock when using random generators]
174
- std::lock_guard<std::mutex> lock(gen->mutex_);
175
- launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
176
- }
177
-
178
- launch(launch_params, /*configure=*/ false);
179
-
180
- return { ctx, s };
181
- }
182
-
183
-
184
- std::vector<at::Tensor>
185
- mha_bwd(const at::Tensor &dout, // total x num_heads, x head_size
186
- const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
187
- at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
188
- const at::Tensor &cu_seqlens, // b+1
189
- const float p_dropout, // probability to drop
190
- const int max_seq_len, // max sequence length to choose the kernel
191
- const bool zero_tensors
192
- ) {
193
- using namespace torch::indexing;
194
- auto dprops = at::cuda::getCurrentDeviceProperties();
195
- TORCH_CHECK((dprops->major == 8 && dprops->minor == 0) ||
196
- (dprops->major == 9 && dprops->minor == 0));
197
- int seq_len = 512;
198
- auto launch = &run_fmha_dgrad_fp16_512_64_sm80;
199
- if( max_seq_len <= 128 ) {
200
- seq_len = 128;
201
- launch = &run_fmha_dgrad_fp16_128_64_sm80;
202
- } else if( max_seq_len <= 256 ) {
203
- seq_len = 256;
204
- launch = &run_fmha_dgrad_fp16_256_64_sm80;
205
- } else if( max_seq_len <= 384 ) {
206
- seq_len = 384;
207
- launch = &run_fmha_dgrad_fp16_384_64_sm80;
208
- } else if( max_seq_len <= 512 ) {
209
- seq_len = 512;
210
- launch = &run_fmha_dgrad_fp16_512_64_sm80;
211
- } else {
212
- TORCH_CHECK(false);
213
- }
214
-
215
- auto stream = at::cuda::getCurrentCUDAStream().stream();
216
-
217
- TORCH_CHECK(qkv.dtype() == torch::kFloat16);
218
- TORCH_CHECK(dout.dtype() == torch::kFloat16);
219
- TORCH_CHECK(softmax.dtype() == torch::kFloat16);
220
- TORCH_CHECK(cu_seqlens.dtype() == torch::kInt32);
221
-
222
- TORCH_CHECK(qkv.is_cuda());
223
- TORCH_CHECK(cu_seqlens.is_cuda());
224
-
225
- TORCH_CHECK(qkv.is_contiguous());
226
- TORCH_CHECK(cu_seqlens.is_contiguous());
227
-
228
- TORCH_CHECK(cu_seqlens.dim() == 1);
229
- TORCH_CHECK(qkv.dim() == 4);
230
-
231
- const auto sizes = qkv.sizes();
232
-
233
- TORCH_CHECK(sizes[THREE_DIM] == 3);
234
-
235
- const int batch_size = cu_seqlens.numel() - 1;
236
- const int num_heads = sizes[H_DIM];
237
- const int head_size = sizes[D_DIM];
238
- TORCH_CHECK(batch_size > 0);
239
- TORCH_CHECK(head_size == 64);
240
-
241
- auto dqkv = torch::empty_like(qkv);
242
-
243
- if( zero_tensors ) {
244
- mha_fill(dqkv, cu_seqlens.index({Slice(-1,None)}));
245
- }
246
-
247
- Fused_multihead_attention_fprop_params params;
248
-
249
- set_params(params,
250
- batch_size,
251
- seq_len,
252
- num_heads,
253
- head_size,
254
- qkv.data_ptr(),
255
- cu_seqlens.data_ptr(),
256
- dout.data_ptr(), // we set o_ptr to dout
257
- softmax.data_ptr(), // softmax gets overwritten by dP!
258
- p_dropout);
259
-
260
- // we're re-using these scales
261
- Data_type acc_type = DATA_TYPE_FP32;
262
- set_alpha(params.scale_bmm1, 1.f, acc_type);
263
- set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
264
- set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
265
- params.dqkv_ptr = dqkv.data_ptr();
266
-
267
- launch(params, stream);
268
- return { dqkv, softmax };
269
- }
270
-
271
- std::vector<at::Tensor> mha_bwd_nl(const at::Tensor &dout, // total x num_heads, x head_size
272
- const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
273
- at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
274
- const at::Tensor &cu_seqlens, // b+1
275
- const float p_dropout, // probability to drop
276
- const int max_seq_len, // max sequence length to choose the kernel
277
- const bool zero_tensors
278
- ) {
279
-
280
- auto stream = at::cuda::getCurrentCUDAStream().stream();
281
-
282
- TORCH_CHECK(qkv.is_cuda())
283
- TORCH_CHECK(cu_seqlens.is_cuda())
284
-
285
- TORCH_CHECK(qkv.is_contiguous())
286
- TORCH_CHECK(cu_seqlens.is_contiguous())
287
-
288
- TORCH_CHECK(cu_seqlens.dim() == 1);
289
-
290
- TORCH_CHECK(qkv.dim() == 4);
291
-
292
- const auto sizes = qkv.sizes();
293
-
294
- TORCH_CHECK(sizes[THREE_DIM] == 3);
295
-
296
- const int batch_size = cu_seqlens.numel() - 1;
297
-
298
- const int total = sizes[TOTAL_DIM];
299
- const int num_heads = sizes[H_DIM];
300
- const int head_size = sizes[D_DIM];
301
- TORCH_CHECK(batch_size > 0);
302
- TORCH_CHECK(head_size == 64);
303
-
304
- int seq_len = 512;
305
- auto launch = &run_fmha_dgrad_fp16_512_64_sm80_nl;
306
-
307
- auto opts = qkv.options();
308
-
309
- auto dqkv = torch::empty_like(qkv);
310
-
311
- if( zero_tensors ) {
312
- dqkv.zero_();
313
- }
314
-
315
- int num_chunks = 2;
316
- if( batch_size == 1 ) {
317
- num_chunks = 4;
318
- }else if( batch_size == 2 ) {
319
- num_chunks = 3;
320
- }
321
- auto dkv = torch::empty({total, num_chunks, 2, num_heads, head_size}, opts);
322
-
323
- Fused_multihead_attention_fprop_params params;
324
-
325
- set_params(params,
326
- batch_size,
327
- seq_len,
328
- num_heads,
329
- head_size,
330
- qkv.data_ptr(),
331
- cu_seqlens.data_ptr(),
332
- dout.data_ptr(), // o_ptr = dout
333
- softmax.data_ptr(), // softmax gets overwritten by dP!
334
- p_dropout);
335
-
336
- params.dkv_ptr = dkv.data_ptr();
337
-
338
- Data_type acc_type = DATA_TYPE_FP32;
339
- set_alpha(params.scale_bmm1, 1.f, acc_type);
340
- set_alpha(params.scale_softmax, 1.f / sqrtf(head_size), acc_type);
341
- set_alpha(params.scale_bmm2, 1.f, DATA_TYPE_FP16);
342
- params.dqkv_ptr = dqkv.data_ptr();
343
-
344
- launch(params, num_chunks, stream);
345
-
346
- //SPLIT-K reduction of num_chunks dK, dV parts
347
-
348
- // The equivalent of the following Pytorch code:
349
- // using namespace torch::indexing;
350
- // at::Tensor view_out = dqkv.index({Slice(), Slice(1, None, None)});
351
- // torch::sum_out(view_out, dkv, 1);
352
-
353
- const int hidden_size = num_heads * head_size;
354
- fmha_run_noloop_reduce(
355
- dqkv.data_ptr(), dkv.data_ptr(), cu_seqlens.data_ptr<int>(), hidden_size, batch_size, total, num_chunks, stream);
356
-
357
- return { dqkv, softmax, dkv };
358
- }
359
-
360
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
361
- m.doc() = "Fused Multi-head Self-attention for BERT";
362
- m.def("fwd", &mha_fwd, "Forward pass");
363
- m.def("bwd", &mha_bwd, "Backward pass");
364
- m.def("bwd_nl", &mha_bwd_nl, "Backward pass (small-batch)");
365
- }
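One detail of set_params worth keeping in mind when reading the backward paths above: the caller passes a drop probability, but the params store the keep probability and its reciprocal, which is the scale applied to surviving activations. A tiny standalone sketch of that bookkeeping:

#include <cassert>
#include <cstdio>

int main() {
    float p_dropout = 0.1f;             // probability of dropping an activation (caller's argument)
    float p_keep    = 1.f - p_dropout;  // stored as params.p_dropout in set_params
    float rp_keep   = 1.f / p_keep;     // params.rp_dropout, folded into scale_dropout

    assert(p_dropout < 1.f);            // mirrors the TORCH_CHECK in set_params
    std::printf("keep=%.3f scale=%.4f\n", p_keep, rp_keep);  // keep=0.900 scale=1.1111
    return 0;
}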
apex/apex/contrib/csrc/fmha/src/fmha.h DELETED
@@ -1,163 +0,0 @@
- /******************************************************************************
-  * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
-  *
-  * Redistribution and use in source and binary forms, with or without
-  * modification, are permitted provided that the following conditions are met:
-  *     * Redistributions of source code must retain the above copyright
-  *       notice, this list of conditions and the following disclaimer.
-  *     * Redistributions in binary form must reproduce the above copyright
-  *       notice, this list of conditions and the following disclaimer in the
-  *       documentation and/or other materials provided with the distribution.
-  *     * Neither the name of the NVIDIA CORPORATION nor the
-  *       names of its contributors may be used to endorse or promote products
-  *       derived from this software without specific prior written permission.
-  *
-  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-  *
-  ******************************************************************************/
-
- #pragma once
-
- #include <cuda.h>
- #include <vector>
-
- #ifdef OLD_GENERATOR_PATH
- #include <ATen/CUDAGeneratorImpl.h>
- #else
- #include <ATen/cuda/CUDAGeneratorImpl.h>
- #endif
-
- #include <ATen/cuda/CUDAGraphsUtils.cuh>
-
- #include <fmha_utils.h>
-
-
- constexpr int TOTAL_DIM = 0;
- constexpr int THREE_DIM = 1;
- constexpr int H_DIM = 2;
- constexpr int D_DIM = 3;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct Qkv_params {
-     // The QKV matrices.
-     void * __restrict__ qkv_ptr;
-
-     // The stride between rows of the Q, K and V matrices.
-     size_t qkv_stride_in_bytes;
-
-     // The number of heads.
-     int h;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct Fused_multihead_attention_fprop_params : public Qkv_params {
-
-     // The dQKV matrices.
-     void * __restrict__ dqkv_ptr;
-
-     // Temporary for dKV.
-     void * __restrict__ dkv_ptr;
-
-     // The O matrix (output).
-     void * __restrict__ o_ptr;
-
-     // The stride between rows of O.
-     int64_t o_stride_in_bytes;
-
-     // The pointer to the S matrix, overwritten by the dP matrix (bwd).
-     void * __restrict__ s_ptr;
-     // The stride between rows of the S matrix.
-     int64_t s_stride_in_bytes;
-
-     // The dimensions.
-     int b, s, d;
-
-     // The scaling factors for the kernel.
-     uint32_t scale_bmm1, scale_softmax, scale_bmm2;
-
-     // array of length b+1 holding starting offset of each sequence.
-     int * __restrict__ cu_seqlens;
-
-     // The dropout probability (probability of keeping an activation).
-     float p_dropout;
-
-     // Scale factor of 1 / (1 - p_dropout).
-     float rp_dropout;
-
-     // Scale factor of 1 / (1 - p_dropout), in half2.
-     uint32_t scale_dropout;
-
-     // Random state.
-     at::PhiloxCudaState philox_args;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename Kernel_params>
- struct Launch_params {
-     Launch_params(cudaDeviceProp * props_,
-                   cudaStream_t stream_,
-                   bool is_training_,
-                   bool is_nl_)
-         : elts_per_thread(0)
-         , props(props_)
-         , stream(stream_)
-         , is_training(is_training_)
-         , is_nl(is_nl_) {
-     }
-
-     size_t elts_per_thread;
-
-     cudaDeviceProp * props;
-
-     cudaStream_t stream;
-
-     bool is_training;
-
-     Kernel_params params;
-     int num_full_heads;
-     int num_main_groups;
-     int heads_last_wave;
-     int main_steps;
-     int rest_steps;
-     bool is_nl;
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- void run_fmha_fp16_128_64_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
- void run_fmha_fp16_256_64_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
- void run_fmha_fp16_384_64_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
- void run_fmha_fp16_512_64_sm80(Launch_params<Fused_multihead_attention_fprop_params> &launch_params, const bool configure);
-
- void run_fmha_dgrad_fp16_128_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
- void run_fmha_dgrad_fp16_256_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
- void run_fmha_dgrad_fp16_384_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
- void run_fmha_dgrad_fp16_512_64_sm80(const Fused_multihead_attention_fprop_params &params, cudaStream_t stream);
-
- void run_fmha_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params &params, const bool is_training, const int num_chunks, cudaStream_t stream);
-
- void run_fmha_dgrad_fp16_512_64_sm80_nl(const Fused_multihead_attention_fprop_params &params, const int num_chunks, cudaStream_t stream);
-
- void fmha_run_noloop_reduce(void *out,
-                             const void *in,
-                             const int *cu_seqlens,
-                             const int hidden_size,
-                             const int batch_size,
-                             const int total,
-                             const int num_chunks,
-                             cudaStream_t stream);
-
-
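The cu_seqlens field declared above drives the packed variable-length layout: it holds b+1 cumulative offsets into the "total" token dimension of the [total, 3, h, d] QKV tensor, which is also why the bindings derive batch_size as cu_seqlens.numel() - 1. A small illustrative sketch follows (the concrete lengths are made up, not taken from the deleted sources):

    #include <cstdio>
    #include <vector>

    int main() {
        // b+1 cumulative start offsets for a hypothetical batch of 3 sequences.
        std::vector<int> cu_seqlens = {0, 128, 320, 512};
        const int batch_size = static_cast<int>(cu_seqlens.size()) - 1;
        const int total = cu_seqlens.back();  // rows of the packed [total, 3, h, d] tensor
        for (int b = 0; b < batch_size; ++b) {
            const int seqlen = cu_seqlens[b + 1] - cu_seqlens[b];
            std::printf("sequence %d: rows [%d, %d), length %d of %d total\n",
                        b, cu_seqlens[b], cu_seqlens[b + 1], seqlen, total);
        }
        return 0;
    }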
apex/apex/contrib/csrc/fmha/src/fmha/gemm.h DELETED
@@ -1,314 +0,0 @@
- /******************************************************************************
-  * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
-  *
-  * Redistribution and use in source and binary forms, with or without
-  * modification, are permitted provided that the following conditions are met:
-  *     * Redistributions of source code must retain the above copyright
-  *       notice, this list of conditions and the following disclaimer.
-  *     * Redistributions in binary form must reproduce the above copyright
-  *       notice, this list of conditions and the following disclaimer in the
-  *       documentation and/or other materials provided with the distribution.
-  *     * Neither the name of the NVIDIA CORPORATION nor the
-  *       names of its contributors may be used to endorse or promote products
-  *       derived from this software without specific prior written permission.
-  *
-  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-  *
-  ******************************************************************************/
-
- #pragma once
-
- #include <fmha/utils.h>
-
- #define FMHA_DIV_UP(m, n) (((m) + (n)-1) / (n))
-
- namespace fmha {
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Data_type_, int NUM_ELTS_, int BITS_PER_ELT_, int ALIGNMENT_ >
- struct Fragment_base_ {
-
-     // The data type.
-     using Data_type = Data_type_;
-     // The default input type.
-     using Input_type_ = Data_type_;
-     // Does it store the array of elements.
-     enum { HAS_ELTS = BITS_PER_ELT_ >= 8 };
-     // The number of elements.
-     enum { NUM_ELTS = NUM_ELTS_ };
-     // The size of an element in bits.
-     enum { BITS_PER_ELT = BITS_PER_ELT_ };
-     // The size in bytes of a single register.
-     enum { BYTES_PER_REG = 4 };
-     // The size in bits.
-     enum { BITS_PER_REG = BYTES_PER_REG * 8 };
-     // The number of registers needed to store the fragment.
-     enum { NUM_REGS = Div_up<NUM_ELTS * BITS_PER_ELT, BITS_PER_REG>::VALUE };
-     // The size in bytes (as returned by sizeof(Fragment_base<>)).
-     enum { SIZE_IN_BYTES = NUM_REGS * BYTES_PER_REG };
-     // The alignment.
-     enum { ALIGNMENT = ALIGNMENT_ > 0 ? ALIGNMENT_ : Min<NUM_REGS * BYTES_PER_REG, 16>::VALUE };
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<
-     // The type of the elements.
-     typename Data_type_,
-     // The number of elements.
-     int NUM_ELTS_,
-     // The alignment if you want to force a value -- use 0 otherwise.
-     int ALIGNMENT_ = 0,
-     // The base class.
-     typename Base_ = Fragment_base_<Data_type_, NUM_ELTS_, 8 * sizeof(Data_type_), ALIGNMENT_>
- >
- struct alignas(static_cast<int>(Base_::ALIGNMENT)) Fragment : public Base_ {
-
-     // The size of a load/store.
-     enum { BYTES_PER_LOAD_STORE = Base_::NUM_REGS * sizeof(uint32_t) };
-
-     // Clear the fragment. Using PTX in that code seems to produce better SASS...
-     inline __device__ void clear() {
-         #pragma unroll
-         for( int ii = 0; ii < Base_::NUM_REGS; ++ii ) {
-             asm volatile("mov.u32 %0, 0; \n" : "=r"(this->reg(ii)) : );
-         }
-     }
-
-     // Immutable access to a register.
-     inline __device__ const uint32_t& reg(int ii) const {
-         return this->regs_[ii];
-     }
-
-     // Mutable access to a register.
-     inline __device__ uint32_t& reg(int ii) {
-         return this->regs_[ii];
-     }
-
-     uint32_t regs_[Base_::NUM_REGS];
-
-     // Immutable access to the elements.
-     inline __device__ const Data_type_& elt(int ii) const {
-         return reinterpret_cast<const Data_type_*>(&this->regs_[0])[ii];
-     }
-
-     // Mutable access to the elements.
-     inline __device__ Data_type_& elt(int ii) {
-         return reinterpret_cast<Data_type_*>(&this->regs_[0])[ii];
-     }
-
-     // Immutable access to the elements with a cast.
-     template< typename Cast_type >
-     inline __device__ const Cast_type& elt_as(int ii) const {
-         return reinterpret_cast<const Cast_type*>(&this->regs_[0])[ii];
-     }
-
-     // Mutable access to the elements with a cast.
-     template< typename Cast_type >
-     inline __device__ Cast_type& elt_as(int ii) {
-         return reinterpret_cast<Cast_type*>(&this->regs_[0])[ii];
-     }
-
-     // Add another fragment.
-     inline __device__ void add(const Fragment &other) {
-         #pragma unroll
-         for( int ii = 0; ii < NUM_ELTS_; ++ii ) {
-             this->elt(ii) += other.elt(ii);
-         }
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Layout >
- struct Fragment_a : public Fragment<uint16_t, 8> {
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Layout >
- struct Fragment_b : public Fragment<uint16_t, 8> {
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- struct Fragment_accumulator : public Fragment<float, 8> {
-
-     // The base class.
-     using Base = Fragment<float, 8>;
-
-     // Add two fragments.
-     template< typename Other_fragment_ >
-     inline __device__ void add(const Other_fragment_ &other) {
-         for( int ii = 0; ii < Base::NUM_ELTS; ++ii ) {
-             this->elt(ii) = this->elt(ii) + other.elt(ii);
-         }
-     }
-
-     // Do the HMMA.
-     template< typename Layout_a, typename Layout_b >
-     inline __device__ void mma(const Fragment_a<Layout_a> &a,
-                                const Fragment_b<Layout_b> &b) {
-         asm volatile( \
-             "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
-             "    {%0, %1, %2, %3}, \n" \
-             "    {%4, %5, %6, %7}, \n" \
-             "    {%8, %9}, \n" \
-             "    {%0, %1, %2, %3}; \n" \
-                     : "+f"( elt(0)), "+f"( elt(1)), "+f"( elt(2)), "+f"( elt(3))
-                     : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
-                     , "r"(b.reg(0)), "r"(b.reg(1)));
-         asm volatile( \
-             "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 \n" \
-             "    {%0, %1, %2, %3}, \n" \
-             "    {%4, %5, %6, %7}, \n" \
-             "    {%8, %9}, \n" \
-             "    {%0, %1, %2, %3}; \n" \
-                     : "+f"( elt(4)), "+f"( elt(5)), "+f"( elt(6)), "+f"( elt(7))
-                     : "r"(a.reg(0)), "r"(a.reg(1)), "r"(a.reg(2)), "r"(a.reg(3))
-                     , "r"(b.reg(2)), "r"(b.reg(3)));
-     }
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Fragment, int M, int N >
- inline __device__ void clear(Fragment (&frag)[M][N]) {
-     #pragma unroll
-     for( int mi = 0; mi < M; ++mi ) {
-         #pragma unroll
-         for( int ni = 0; ni < N; ++ni ) {
-             frag[mi][ni].clear();
-         }
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Accumulator_type, int WARPS_K >
- struct Clear_accumulator {
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< int WARPS_K >
- struct Clear_accumulator<float, WARPS_K> {
-     template< typename Acc, int M, int N >
-     static inline __device__ void apply(Acc (&acc)[M][N], bool = false) {
-         fmha::clear(acc);
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename Acc, typename A, typename B, int M, int N>
- inline __device__ void gemm(Acc (&acc)[M][N], const A (&a)[M], const B (&b)[N]) {
-
-     #pragma unroll
-     for( int mi = 0; mi < M; ++mi ) {
-         #pragma unroll
-         for( int ni = 0; ni < N; ++ni ) {
-             acc[mi][ni].mma(a[mi], b[ni]);
-         }
-     }
- }
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<
-     // The number of rows in the CTA tile.
-     int M_,
-     // The number of cols in the CTA tile.
-     int N_,
-     // The number of elements in the K dimension of the GEMM loop.
-     int K_,
-     // The number of rows of warps.
-     int WARPS_M_,
-     // The number of cols of warps.
-     int WARPS_N_,
-     // The number of warps in the K dimension of the GEMM loop.
-     int WARPS_K_>
- struct Cta_tile_ {
-
-     enum { M = M_, N = N_, K = K_ };
-     // The number of warps.
-     enum { WARPS_M = WARPS_M_, WARPS_N = WARPS_N_, WARPS_K = WARPS_K_ };
-     // The number of warps per CTA.
-     enum { WARPS_PER_CTA = WARPS_M * WARPS_N * WARPS_K };
-     // The number of threads per warp.
-     enum { THREADS_PER_WARP = 32 };
-     // The number of threads per CTA.
-     enum { THREADS_PER_CTA = WARPS_PER_CTA * THREADS_PER_WARP };
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename Cta_tile>
- struct Hmma_tile {
-     // The number of elements computed with a single warp-MMA.
-     enum { M_PER_MMA = 16, N_PER_MMA = 16, K_PER_MMA = 16 };
-
-     // The number of elements computed with a single CTA-MMA.
-     enum {
-         M_PER_MMA_PER_CTA = M_PER_MMA * Cta_tile::WARPS_M,
-         N_PER_MMA_PER_CTA = N_PER_MMA * Cta_tile::WARPS_N,
-         K_PER_MMA_PER_CTA = K_PER_MMA * Cta_tile::WARPS_K
-     };
-
-     // The number of MMAs needed to compute the GEMM.
-     enum {
-         MMAS_M = Div_up<Cta_tile::M, M_PER_MMA_PER_CTA>::VALUE,
-         MMAS_N = Div_up<Cta_tile::N, N_PER_MMA_PER_CTA>::VALUE,
-         MMAS_K = Div_up<Cta_tile::K, K_PER_MMA_PER_CTA>::VALUE,
-     };
-
-     // The number of elements computed per warp.
-     enum {
-         M_PER_WARP = MMAS_M * M_PER_MMA,
-         N_PER_WARP = MMAS_N * N_PER_MMA,
-         K_PER_WARP = MMAS_K * K_PER_MMA,
-     };
-
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- using A_type = uint16_t;
- using B_type = uint16_t;
- using C_type = uint16_t;
- using Accumulator_type = float;
- using Epilogue_type = float;
-
- constexpr int BITS_PER_ELEMENT_A = sizeof(A_type) * 8;
- constexpr int BITS_PER_ELEMENT_B = sizeof(B_type) * 8;
- constexpr int BITS_PER_ELEMENT_C = sizeof(C_type) * 8;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<int M, int N, int K, int WARPS_M, int WARPS_N, int WARPS_K>
- using Cta_tile_extd = Cta_tile_<M, N, K, WARPS_M, WARPS_N, WARPS_K>;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<typename Cta_tile_>
- using Cta_tile_with_k_with_padding = Cta_tile_extd<Cta_tile_::M,
-                                                    Cta_tile_::N,
-                                                    Next_power_of_two<Cta_tile_::K>::VALUE,
-                                                    Cta_tile_::WARPS_M,
-                                                    Cta_tile_::WARPS_N,
-                                                    Cta_tile_::WARPS_K>;
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- } // namespace fmha
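To make the tile bookkeeping above concrete, here is a small host-side check of the Cta_tile_/Hmma_tile arithmetic for a hypothetical 128x128x64 CTA tile split over 4x1x1 warps (the tile sizes are chosen for illustration only, not taken from the shipped kernels). With 16x16x16 HMMA tiles it works out to 2x8x4 MMAs per warp and a 128-thread CTA. The div_up helper mirrors fmha::Div_up / FMHA_DIV_UP.

    #include <cstdio>

    constexpr int div_up(int m, int n) { return (m + n - 1) / n; }  // same rounding as FMHA_DIV_UP

    int main() {
        constexpr int M = 128, N = 128, K = 64;                     // hypothetical CTA tile
        constexpr int WARPS_M = 4, WARPS_N = 1, WARPS_K = 1;
        constexpr int THREADS_PER_CTA = WARPS_M * WARPS_N * WARPS_K * 32;
        constexpr int MMAS_M = div_up(M, 16 * WARPS_M);             // 2
        constexpr int MMAS_N = div_up(N, 16 * WARPS_N);             // 8
        constexpr int MMAS_K = div_up(K, 16 * WARPS_K);             // 4
        std::printf("threads/CTA=%d  MMAS_M=%d  MMAS_N=%d  MMAS_K=%d\n",
                    THREADS_PER_CTA, MMAS_M, MMAS_N, MMAS_K);
        return 0;
    }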
apex/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h DELETED
@@ -1,456 +0,0 @@
- /******************************************************************************
-  * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
-  *
-  * Redistribution and use in source and binary forms, with or without
-  * modification, are permitted provided that the following conditions are met:
-  *     * Redistributions of source code must retain the above copyright
-  *       notice, this list of conditions and the following disclaimer.
-  *     * Redistributions in binary form must reproduce the above copyright
-  *       notice, this list of conditions and the following disclaimer in the
-  *       documentation and/or other materials provided with the distribution.
-  *     * Neither the name of the NVIDIA CORPORATION nor the
-  *       names of its contributors may be used to endorse or promote products
-  *       derived from this software without specific prior written permission.
-  *
-  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-  *
-  ******************************************************************************/
-
- #pragma once
-
- namespace fmha {
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<
-     // The dimensions of the tile computed by the CTA.
-     typename Cta_tile,
-     // The number of bits per element.
-     int BITS_PER_ELEMENT,
-     // The number of rows of Q, K or V loaded by this tile.
-     int ROWS,
-     // The number of columns.
-     int COLS,
-     // The number of matrices.
-     int NUM_MATS = 3
- >
- struct Gmem_tile_qkv {
-
-     // The size of each LDG.
-     enum { BYTES_PER_LDG = 16 };
-     // The size of a row in bytes.
-     enum { BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8 };
-
-     // The number of threads to load a "row" of the matrix.
-     enum { THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG };
-
-     // The number of "rows" loaded per LDG.
-     enum { ROWS_PER_LDG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
-     // The number of LDGs needed to load a chunk of the Q matrix.
-     enum { LDGS = fmha::Div_up<ROWS, ROWS_PER_LDG>::VALUE };
-
-     // Ctor.
-     template< typename Params, typename BInfo >
-     inline __device__ Gmem_tile_qkv(const Params &params, const int qkv_offset, const BInfo &binfo, const int tidx)
-         : params_qkv_stride_in_bytes_(params.qkv_stride_in_bytes)
-         , actual_seqlen(binfo.actual_seqlen)
-         , qkv_ptr_(reinterpret_cast<char *>(params.qkv_ptr)) {
-
-         // Compute the position in the sequence (within the CTA for the moment).
-         int row = tidx / THREADS_PER_ROW;
-         // Compute the position of the thread in the row.
-         int col = tidx % THREADS_PER_ROW;
-
-         // Store the row as we need it to disable the loads.
-         row_ = row;
-
-         // The row offset in the batched GEMM. For each seq element, we store QKV in that order.
-         int64_t row_offset = (int64_t)row * params.qkv_stride_in_bytes;
-         // Add the block index.
-         row_offset += (int64_t)((binfo.sum_s * NUM_MATS + qkv_offset) * binfo.h + binfo.bidh) * BYTES_PER_ROW;
-
-         // Assemble the final pointer.
-         qkv_ptr_ += row_offset + col * BYTES_PER_LDG;
-     }
-
-     // Store data to shared memory.
-     template< typename Smem_tile >
-     inline __device__ void commit(Smem_tile &smem_tile) {
-         smem_tile.store(fetch_);
-     }
-
-     // Load data from memory.
-     template< typename Smem_tile >
-     inline __device__ void load(Smem_tile &smem_tile) {
-         const void *ptrs[LDGS];
-         uint32_t preds[LDGS];
-         #pragma unroll
-         for( int ii = 0; ii < LDGS; ++ii ) {
-             ptrs[ii] = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
-             preds[ii] = ((row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen));
-             fetch_[ii] = make_uint4(0, 0, 0, 0);
-         }
-
-         // not packing predicates removes restrictions (e.g. FP16 384, 4 warps)
-         Ldg_functor<uint4, LDGS> fct(fetch_, ptrs);
-         #pragma unroll
-         for( int ii = 0; ii < LDGS; ++ii ) {
-             fct.load(ii, preds[ii]);
-         }
-     }
-
-     // Store data to memory.
-     inline __device__ void store(const uint4 (&data)[LDGS]) {
-         #pragma unroll
-         for( int ii = 0; ii < LDGS; ++ii ) {
-             char *ptr = qkv_ptr_ + (int64_t)ii * ROWS_PER_LDG * params_qkv_stride_in_bytes_;
-             if( (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen) ) {
-                 fmha::stg(ptr, data[ii]);
-             }
-         }
-     }
-
-     // Move the pointer to the next location.
-     inline __device__ void move() {
-         qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_;
-         actual_seqlen -= ROWS;
-     }
-
-     inline __device__ void move(int steps) {
-         qkv_ptr_ += (int64_t)ROWS * params_qkv_stride_in_bytes_ * steps;
-         actual_seqlen -= ROWS * steps;
-     }
-
-     // The stride between rows of the QKV matrix.
-     int64_t params_qkv_stride_in_bytes_;
-     // The pointer.
-     char *qkv_ptr_;
-     // The fetch registers.
-     uint4 fetch_[LDGS];
-     // Keep track of the row the thread is processing as we move the tile.
-     int row_;
-     // The length of the sequence loaded by that memory tile.
-     int actual_seqlen;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Cta_tile >
- struct Gmem_tile_o {
-
-     // The mma tile.
-     using Mma_tile = fmha::Hmma_tile<Cta_tile>;
-
-     // The size of each element.
-     enum { BYTES_PER_ELEMENT = 2 };
-     // The size of a row in bytes.
-     enum { BYTES_PER_ROW = Cta_tile::N * BYTES_PER_ELEMENT };
-
-     // The number of threads to store a "row" of the matrix.
-     enum { THREADS_PER_ROW = 16 };
-     // The size of each STG.
-     enum { BYTES_PER_STG = BYTES_PER_ROW / THREADS_PER_ROW };
-
-     // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
-     enum { ROWS = Cta_tile::M };
-     // The number of "rows" stored per iteration of the loop. The output of 1 MMA.
-     enum { ROWS_PER_LOOP = ROWS <= 64 ? ROWS : (int)Mma_tile::M_PER_MMA_PER_CTA };
-     // The number of outer loops for the stores.
-     enum { LOOPS = ROWS / ROWS_PER_LOOP };
-
-     // The number of "rows" stored per STG.
-     enum { ROWS_PER_STG = Cta_tile::THREADS_PER_CTA / THREADS_PER_ROW };
-     // Do we have to guard against partial writes/reads.
-     enum { HAS_INCOMPLETE_STG = Cta_tile::M % ROWS_PER_STG != 0 };
-     // The number of STGs needed to store a chunk of the O matrix.
-     enum { STGS_PER_LOOP = fmha::Div_up<ROWS_PER_LOOP, ROWS_PER_STG>::VALUE };
-     // The number of STGs needed to store a chunk of the O matrix in total.
-     enum { STGS = STGS_PER_LOOP * LOOPS };
-
-     // Ctor.
-     template<typename Params, typename BInfo>
-     inline __device__ Gmem_tile_o(const Params &params, const BInfo &binfo, int tidx)
-         : params_o_stride_in_bytes_(params.o_stride_in_bytes)
-         , actual_seqlen_(binfo.actual_seqlen)
-         , o_ptr_(reinterpret_cast<char *>(params.o_ptr)) {
-
-         // Compute the position in the sequence (within the CTA for the moment).
-         int row = tidx / THREADS_PER_ROW;
-         // Compute the position of the thread in the row.
-         int col = tidx % THREADS_PER_ROW;
-
-         // Store the row as we need it to disable loads.
-         row_ = row;
-
-         // The row offset in the batched GEMM.
-         int64_t row_offset = (int64_t)row * params.o_stride_in_bytes + binfo.bidx * BYTES_PER_ROW;
-         // Assemble the final pointer.
-         o_ptr_ += row_offset + col * BYTES_PER_STG;
-
-         // Is that thread active on the last STG?
-         if( HAS_INCOMPLETE_STG ) {
-             is_active_for_last_stg_ = row + (STGS - 1) * ROWS_PER_STG < Cta_tile::M;
-         }
-     }
-
-     // Store data to global memory.
-     inline __device__ void store(const uint4 (&src)[STGS_PER_LOOP], int mi) {
-
-         #pragma unroll
-         for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
-             int jj = mi * STGS_PER_LOOP + ii;
-             if( this->row_ + jj * ROWS_PER_STG >= this->actual_seqlen_ ) {
-                 break;
-             }
-
-             float x = reinterpret_cast<const float &>(src[ii].x);
-             float y = reinterpret_cast<const float &>(src[ii].y);
-             float z = reinterpret_cast<const float &>(src[ii].z);
-             float w = reinterpret_cast<const float &>(src[ii].w);
-             uint2 out = float4_to_half4(x, y, z, w);
-             if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
-                 fmha::stg(this->o_ptr_ + jj * ROWS_PER_STG * this->params_o_stride_in_bytes_, out);
-             }
-         }
-     }
-
-     // Move the pointer to the next location.
-     inline __device__ void move() {
-         row_ += ROWS;
-         o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_;
-     }
-
-     inline __device__ void move(const int steps) {
-         row_ += ROWS * steps;
-         o_ptr_ += (int64_t)ROWS * params_o_stride_in_bytes_ * steps;
-     }
-
-     // The stride between rows of the O matrix.
-     int64_t params_o_stride_in_bytes_;
-     // The pointer.
-     char *o_ptr_;
-     // Is the thread active for the last STG?
-     int is_active_for_last_stg_;
-     // Keep track of the row to disable loads.
-     int row_;
-     // The length of the sequence loaded by that memory tile.
-     int actual_seqlen_;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Cta_tile, int BYTES_PER_ELEMENT >
- struct Gmem_tile_mma_sd {
-
-     // The mma tile.
-     using Mma_tile = fmha::Hmma_tile<Cta_tile>;
-
-     // Each STG stores 8 elements.
-     enum { BYTES_PER_STG = BYTES_PER_ELEMENT * 8 };
-     // The number of MMAs in the M dimension.
-     enum { MMAS_M = Mma_tile::MMAS_M };
-     // The number of MMAs in the N dimension.
-     enum { MMAS_N = Mma_tile::MMAS_N };
-     // The number of rows computed per MMA per thread block.
-     enum { M_PER_MMA_PER_CTA = Mma_tile::M_PER_MMA_PER_CTA };
-     // The number of cols computed per MMA per thread block.
-     enum { N_PER_MMA_PER_CTA = Mma_tile::N_PER_MMA_PER_CTA };
-     // The number of threads per block.
-     enum { THREADS_PER_CTA = Cta_tile::THREADS_PER_CTA };
-     // The size of each row in bytes. I.e. how many bytes are stored per STG.
-     enum { BYTES_PER_ROW = THREADS_PER_CTA * BYTES_PER_STG };
-     // The fixed sequence length.
-     enum { SEQLEN = Cta_tile::N };
-     // The distance between two blocks (in bytes).
-     enum { BLOCK_STRIDE_BYTES = SEQLEN * SEQLEN * BYTES_PER_ELEMENT };
-     // The distance between elements stored per loop (in bytes).
-     enum { LOOP_STRIDE_BYTES = MMAS_M * MMAS_N * BYTES_PER_ROW };
-
-     // The type of elements stored per STG.
-     using Type = typename fmha::Uint_from_size_in_bytes<BYTES_PER_STG>::Type;
-
-     // Ctor.
-     template<typename Params>
-     inline __device__ Gmem_tile_mma_sd(void *ptr, const Params &params, const int bidb, const int bidh, const int tidx)
-         : ptr_(static_cast<char *>(ptr)) {
-
-         // The block index.
-         size_t bidx = bidb * params.h + bidh;
-
-         // Set the store location for each thread at the beginning of the loop.
-         ptr_ += bidx * BLOCK_STRIDE_BYTES + tidx * BYTES_PER_STG;
-     }
-
-     // Store to global memory.
-     inline __device__ void store(const Type &data, const int mi, const int ni) {
-         size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
-         fmha::stg(ptr_ + offset, data);
-     }
-
-     // Load from global memory.
-     inline __device__ void load(Type &data, const int mi, const int ni) {
-         size_t offset = (mi * MMAS_N + ni) * BYTES_PER_ROW;
-         fmha::ldg(data, ptr_ + offset);
-     }
-
-     // Move to the next tile.
-     inline __device__ void move() {
-         ptr_ += LOOP_STRIDE_BYTES;
-     }
-     inline __device__ void move(const int steps) {
-         ptr_ += LOOP_STRIDE_BYTES * steps;
-     }
-
-     // The pointer in global memory.
-     char *ptr_;
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Cta_tile, typename Base = Gmem_tile_mma_sd<Cta_tile, sizeof(uint16_t)> >
- struct Gmem_tile_mma_s : public Base {
-
-     // The number of mmas in the vertical dimension.
-     enum { M = Base::MMAS_M };
-     // The number of mmas in the horizontal dimension.
-     enum { N = Base::MMAS_N };
-     // The type of the vectors stored by each STG.
-     using Type = typename Base::Type;
-
-     // Ctor.
-     template< typename Params, typename Block_info >
-     inline __device__ Gmem_tile_mma_s(const Params &params, const Block_info& binfo, const int tidx)
-         : Base(params.s_ptr, params, binfo.bidb, binfo.bidh, tidx) {
-     }
-
-     // Store to global memory.
-     template<typename Mask>
-     inline __device__ void store(const float (&softmax)[2 * M][4 * N], const Mask &mask) {
-         #pragma unroll
-         for( int mi = 0; mi < M; mi++ ) {
-             #pragma unroll
-             for( int ni = 0; ni < N; ni++ ) {
-
-                 float tmp00 = softmax[2 * mi + 0][4 * ni + 0];
-                 float tmp01 = softmax[2 * mi + 0][4 * ni + 1];
-                 float tmp02 = softmax[2 * mi + 0][4 * ni + 2];
-                 float tmp03 = softmax[2 * mi + 0][4 * ni + 3];
-
-                 float tmp10 = softmax[2 * mi + 1][4 * ni + 0];
-                 float tmp11 = softmax[2 * mi + 1][4 * ni + 1];
-                 float tmp12 = softmax[2 * mi + 1][4 * ni + 2];
-                 float tmp13 = softmax[2 * mi + 1][4 * ni + 3];
-
-                 uint4 dst;
-                 dst.x = fmha::float2_to_half2(tmp00, tmp01);
-                 dst.y = fmha::float2_to_half2(tmp02, tmp03);
-                 dst.z = fmha::float2_to_half2(tmp10, tmp11);
-                 dst.w = fmha::float2_to_half2(tmp12, tmp13);
-                 if( mask.is_valid(mi, ni, 0, 0) ) {
-                     Base::store(dst, mi, ni);
-                 }
-             }
-         }
-     }
-
-     // Store to global memory.
-     template<typename Mask, typename Fragment>
-     inline __device__ void store(const Fragment (&frag)[N][M], const Mask& mask) {
-         #pragma unroll
-         for( int mi = 0; mi < M; mi++ ) {
-             #pragma unroll
-             for( int ni = 0; ni < N; ni++ ) {
-                 uint4 dst;
-                 dst.x = frag[ni][mi].reg(0);
-                 dst.y = frag[ni][mi].reg(2);
-                 dst.z = frag[ni][mi].reg(1);
-                 dst.w = frag[ni][mi].reg(3);
-                 if( mask.any_valid(mi, ni) ) {
-                     Base::store(dst, mi, ni);
-                 }
-             }
-         }
-     }
-
-     // Load from global memory.
-     template<typename Mask>
-     inline __device__ void load(uint4 (&regs)[M][N], const Mask &mask) {
-         #pragma unroll
-         for( int mi = 0; mi < M; mi++ ) {
-             #pragma unroll
-             for( int ni = 0; ni < N; ni++ ) {
-                 regs[mi][ni] = make_uint4(0, 0, 0, 0);
-                 if( mask.any_valid(mi, ni) ) {
-                     Base::load(regs[mi][ni], mi, ni);
-                 }
-             }
-         }
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template<
-     // The dimensions of the tile computed by the CTA.
-     typename Cta_tile,
-     // The base class.
-     typename Base = fmha::Gmem_tile_qkv<Cta_tile, fmha::BITS_PER_ELEMENT_A, Cta_tile::M, Cta_tile::K>
- >
- struct Gmem_tile_dout : public Base {
-
-     // Ctor.
-     template<typename Params, typename BInfo>
-     inline __device__ Gmem_tile_dout(const Params &params, const BInfo &binfo, int tidx)
-         : Base(params, 0, binfo, tidx) {
-
-         this->qkv_ptr_ = reinterpret_cast<char *>(params.o_ptr);
-         this->params_qkv_stride_in_bytes_ = params.o_stride_in_bytes;  // needed for move
-
-         // Compute the position of the thread in the row.
-         int col = tidx % Base::THREADS_PER_ROW;
-
-         // The row offset in the batched GEMM. For each seq element, we store O in that order.
-         int64_t row_offset = (int64_t)this->row_ * params.o_stride_in_bytes + binfo.bidx * Base::BYTES_PER_ROW;
-
-         // Assemble the final pointer.
-         this->qkv_ptr_ += row_offset + col * Base::BYTES_PER_LDG;
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- template< typename Cta_tile, typename Base = fmha::Gmem_tile_o<Cta_tile> >
- struct Gmem_tile_dq : public Base {
-
-     // Ctor.
-     template<typename Params, typename BInfo>
-     inline __device__ Gmem_tile_dq(const Params &params, const BInfo &binfo, int tidx)
-         : Base(params, binfo, tidx) {
-         this->o_ptr_ = reinterpret_cast<char *>(params.dqkv_ptr);
-         this->params_o_stride_in_bytes_ = params.qkv_stride_in_bytes;  // needed for move
-
-         // Compute the position of the thread in the row.
-         int col = tidx % Base::THREADS_PER_ROW;
-
-         // The row offset in the batched GEMM. For each seq element, we store O in that order.
-         int64_t row_offset = (int64_t)this->row_ * params.qkv_stride_in_bytes +
-                              (binfo.sum_s * 3 * binfo.h + binfo.bidh) * Base::BYTES_PER_ROW;
-
-         // Assemble the final pointer.
-         this->o_ptr_ += row_offset + col * Base::BYTES_PER_STG;
-     }
- };
-
- ////////////////////////////////////////////////////////////////////////////////////////////////////
-
- } // namespace fmha
-
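As a quick sanity check of the Gmem_tile_qkv load layout above, the constants work out as follows for a hypothetical fp16 tile with 64 columns, 128 rows, and a 128-thread CTA (a configuration chosen only for illustration, not read out of the kernels):

    #include <cstdio>

    constexpr int div_up(int m, int n) { return (m + n - 1) / n; }  // mirrors fmha::Div_up

    int main() {
        constexpr int BITS_PER_ELEMENT = 16;   // fp16 elements
        constexpr int COLS = 64, ROWS = 128, THREADS_PER_CTA = 128;
        constexpr int BYTES_PER_LDG = 16;                                // one 16-byte LDG per thread
        constexpr int BYTES_PER_ROW = COLS * BITS_PER_ELEMENT / 8;       // 128 bytes per row
        constexpr int THREADS_PER_ROW = BYTES_PER_ROW / BYTES_PER_LDG;   // 8 threads cover one row
        constexpr int ROWS_PER_LDG = THREADS_PER_CTA / THREADS_PER_ROW;  // 16 rows per LDG wave
        constexpr int LDGS = div_up(ROWS, ROWS_PER_LDG);                 // 8 LDGs per thread
        std::printf("bytes/row=%d threads/row=%d rows/LDG=%d LDGS=%d\n",
                    BYTES_PER_ROW, THREADS_PER_ROW, ROWS_PER_LDG, LDGS);
        return 0;
    }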